What Are Regular Expressions?
Regular Expressions (Regex) are a tool for finding or replacing specific patterns in strings. Think of it as using a metal detector to find specific items in a pile of text. Once learned, the core syntax carries over to almost every programming language (with minor differences between regex flavors), making it one of the highest-ROI skills you can invest in.
Basic Syntax
Metacharacters
| Character | Meaning | Example | Matches |
|---|---|---|---|
| `.` | Any single character (except a newline, by default) | `a.c` | abc, a1c, a-c |
| `\d` | Digit [0-9] | `\d{3}` | 123, 456 |
| `\w` | Alphanumeric + underscore [a-zA-Z0-9_] | `\w+` | hello, user_1 |
| `\s` | Whitespace (space, tab, newline) | `a\sb` | a b |
| `\D` | Non-digit character | `\D+` | hello, --- |
| `\W` | Non-word character (anything other than [a-zA-Z0-9_]) | `\W` | @, #, ! |
| `^` | Start of string | `^Hello` | Hello… |
| `$` | End of string | `world$` | …world |
| `\b` | Word boundary | `\bcat\b` | cat (not category) |
Quantifiers
| Quantifier | Meaning | Example | Matches |
|---|---|---|---|
| `*` | 0 or more | `ab*c` | ac, abc, abbc |
| `+` | 1 or more | `ab+c` | abc, abbc (not ac) |
| `?` | 0 or 1 | `colou?r` | color, colour |
| `{n}` | Exactly n | `\d{4}` | 2026 |
| `{n,}` | n or more | `\d{2,}` | 12, 123, 1234 |
| `{n,m}` | Between n and m | `\d{2,4}` | 12, 123, 1234 |
Character Classes
import re
# A bracketed character class consumes exactly one character drawn from its set.
pattern_vowel = r"[aeiou]"  # any one lowercase vowel
pattern_hex = r"[0-9a-fA-F]"  # one hexadecimal digit, upper or lower case
pattern_not_digit = r"[^0-9]"  # a leading ^ negates the class: anything but 0-9

text = "Hello World 123"

# Gather both result lists first, then show them in the same order as before.
vowels = re.compile(pattern_vowel).findall(text)
non_digits = re.compile(pattern_not_digit).findall(text)

print(vowels)  # Output: ['e', 'o', 'o']
print(non_digits)  # Output: ['H', 'e', 'l', 'l', 'o', ' ', 'W', 'o', 'r', 'l', 'd', ' ']
Groups and Capturing
Wrapping a pattern in parentheses () lets you extract matched parts separately.
import re
# Extract year, month, day from a date.
# Each (...) is a capturing group, numbered left to right starting at 1;
# group(0) is always the whole match.
date_pattern = r"(\d{4})-(\d{2})-(\d{2})"
text = "Today's date is 2026-04-07."
match = re.search(date_pattern, text)
if match:  # re.search returns None when the pattern is not found
    print(f"Full: {match.group(0)}")  # Output: Full: 2026-04-07
    print(f"Year: {match.group(1)}")  # Output: Year: 2026
    print(f"Month: {match.group(2)}")  # Output: Month: 04
    print(f"Day: {match.group(3)}")  # Output: Day: 07

# Named groups — improved readability: (?P<name>...) lets you fetch a group
# by name instead of by position.
named_pattern = r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})"
match = re.search(named_pattern, text)
if match:
    print(f"Year: {match.group('year')}")  # Output: Year: 2026
    print(f"Month: {match.group('month')}")  # Output: Month: 04

# Non-capturing group — (?:...) groups alternatives without creating a
# numbered group, so the protocol does not occupy group(1).
non_capture = r"(?:http|https)://(\S+)"
url_text = "Visit: https://example.com/path"
match = re.search(non_capture, url_text)
if match:
    print(f"Domain+path: {match.group(1)}")  # Output: Domain+path: example.com/path
    # group(1) is the domain+path, not the protocol (thanks to non-capturing group)
Lookahead and Lookbehind
These check whether a specific condition exists before or after the pattern, without including it in the match result.
import re
# Lookahead — match only when followed by a specific pattern.
# (?=...) checks what comes next without consuming it.
# Find numbers followed by "USD"
text = "Apple 3000USD, Pear 5000USD, Qty 3pcs"
prices = re.findall(r"\d+(?=USD)", text)
print(prices)  # Output: ['3000', '5000'] ("3" not included — not followed by "USD")

# Negative lookahead — match when NOT followed by a specific pattern.
# Gotcha: when "3000" is rejected (it IS followed by "USD"), \d+ backtracks to
# "300", which is followed by "0" rather than "USD", so a partial number matches.
not_price = re.findall(r"\d+(?!USD)", text)
print(not_price)  # Output: ['300', '500', '3']
# Also forbidding a following digit prevents the partial matches, leaving only
# whole numbers that are not followed by "USD".
not_price_strict = re.findall(r"\d+(?!\d|USD)", text)
print(not_price_strict)  # Output: ['3']

# Lookbehind — match only when preceded by a specific pattern.
# Extract only numbers after "$"
text2 = "Price: $100, Qty: 50pcs, Discount: $30"
dollar_amounts = re.findall(r"(?<=\$)\d+", text2)
print(dollar_amounts)  # Output: ['100', '30'] ("50" not included)

# Combination — extract content inside specific tags.
# The lazy .+? stops at the first closing tag instead of the last one.
html = "Name: <b>Alice</b>, Age: <b>30</b>"
bold_contents = re.findall(r"(?<=<b>).+?(?=</b>)", html)
print(bold_contents)  # Output: ['Alice', '30']
Greedy vs. Lazy Matching
import re
text = '<div>First</div><div>Second</div>'

# Default (greedy) quantifier: .* grabs as much as it can, so the match runs
# from the first opening tag all the way to the last closing tag.
greedy = re.compile(r"<div>.*</div>").findall(text)
print(greedy)  # Output: ['<div>First</div><div>Second</div>'] (matched everything as one)

# Lazy quantifier: a trailing ? makes .* stop at the earliest possible point,
# so each element is matched on its own.
lazy = re.compile(r"<div>.*?</div>").findall(text)
print(lazy)  # Output: ['<div>First</div>', '<div>Second</div>'] (matched separately)
Real-World Pattern Collection
import re
# 1. Email address validation
# Local part, "@", domain labels, then a TLD of at least two letters.
email_pattern = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
emails = ["user@example.com", "invalid@", "test@co.kr"]
for email in emails:
    # fullmatch (not match): "$" also matches just before a trailing newline,
    # so re.match would accept "user@example.com\n" as valid.
    valid = bool(re.fullmatch(email_pattern, email))
    print(f"{email}: {'valid' if valid else 'invalid'}")
# Output: user@example.com: valid
# invalid@: invalid
# test@co.kr: valid
# 2. US phone number (various formats)
# Optional parentheses around the area code, and an optional "-", "." or
# whitespace separator between the three digit groups.
phone_pattern = r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"
phones = ["(555) 123-4567", "555.123.4567", "5551234567"]
for phone in phones:
    # re.search looks for the pattern anywhere in the string.
    match = re.search(phone_pattern, phone)
    print(f"{phone}: {'match' if match else 'no match'}")
# Output: (555) 123-4567: match
# 555.123.4567: match
# 5551234567: match
# 3. Password strength validation (8+ chars, letters + numbers + special chars)
def validate_password(pw):
    """Report which strength rules a password satisfies.

    Args:
        pw: The candidate password string.

    Returns:
        A dict mapping each rule name ("8+ chars", "Has letters",
        "Has numbers", "Has special chars") to True when ``pw`` satisfies
        that rule and False otherwise.
    """
    checks = {
        "8+ chars": r".{8,}",
        "Has letters": r"[a-zA-Z]",
        "Has numbers": r"\d",
        "Has special chars": r"[!@#$%^&*(),.?\":{}|]",
    }
    # re.search: each rule only has to be satisfied somewhere in the password.
    return {name: bool(re.search(pattern, pw)) for name, pattern in checks.items()}


print(validate_password("Abc123!@"))
# Output: {'8+ chars': True, 'Has letters': True, 'Has numbers': True, 'Has special chars': True}
# 4. String substitution — personal info masking
def mask_personal_info(text):
    """Mask phone numbers and email addresses found in ``text``.

    Args:
        text: Free-form text that may contain dash-separated phone numbers
            (3 digits, 3-4 digits, 4 digits) and email addresses.

    Returns:
        A copy of ``text`` in which the middle digit group of each phone
        number is replaced by ``****`` and everything after the first two
        characters of each email's local part is replaced by ``***``.
    """
    # Phone number masking: keep the first and last groups, hide the middle.
    text = re.sub(
        r"(\d{3})-(\d{3,4})-(\d{4})",
        r"\1-****-\3",  # Replace middle digits with ****
        text,
    )
    # Email masking: {1,2} (rather than {2}) so that an address with a
    # one-character local part is masked too instead of being left untouched.
    text = re.sub(
        r"([a-zA-Z0-9._%+-]{1,2})([a-zA-Z0-9._%+-]*)(@\S+)",
        r"\1***\3",  # Keep up to the first 2 chars, replace rest with ***
        text,
    )
    return text


sample = "Contact: 010-1234-5678, Email: hong@example.com"
print(mask_personal_info(sample))
# Output: Contact: 010-****-5678, Email: ho***@example.com
# 5. Pull every IPv4-looking token out of a block of log lines
log_text = """
[2026-04-07 10:00:01] 192.168.1.100 GET /api/users 200
[2026-04-07 10:00:02] 10.0.0.55 POST /api/login 401
[2026-04-07 10:00:03] 172.16.0.1 GET /api/health 200
"""
# Four dot-separated runs of 1-3 digits, delimited by word boundaries.
ip_pattern = r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b"
# Walk the matches one by one and keep the full text of each.
ips = [hit.group(0) for hit in re.finditer(ip_pattern, log_text)]
print(f"Found IPs: {ips}")
# Output: Found IPs: ['192.168.1.100', '10.0.0.55', '172.16.0.1']
Regular Expressions in JavaScript
// Regex literals in JavaScript: validate with RegExp.prototype.test()
const emailRegex = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/;
for (const candidate of ["user@example.com", "invalid"]) {
  console.log(emailRegex.test(candidate)); // true, then false
}

// Named capture groups (ES2018+) are exposed on match.groups
const dateRegex = /(?<year>\d{4})-(?<month>\d{2})-(?<day>\d{2})/;
const match = "2026-04-07".match(dateRegex);
const { year, month } = match.groups;
console.log(year); // "2026"
console.log(month); // "04"

// replaceAll with numbered backreferences ($1, $2, $3); a regex argument
// to replaceAll must carry the g flag
const text = "2026-04-07 and 2026-12-25";
const isoDates = /(\d{4})-(\d{2})-(\d{2})/g;
const formatted = text.replaceAll(isoDates, "$1/$2/$3");
console.log(formatted);
// Output: "2026/04/07 and 2026/12/25"
Summary
| Scenario | Recommended Pattern |
|---|---|
| Check if a string contains text | str.includes() (no regex needed) |
| Simple format validation | Basic metacharacters + quantifiers |
| Data extraction | Capture groups () |
| Conditional matching | Lookahead/lookbehind |
| String replacement | re.sub() + backreferences \1 |
| Complex parsing | Use a dedicated parser instead of regex |
- Regex is not a silver bullet: For nested structures like HTML or JSON parsing, use a dedicated parser.
- Reuse with `re.compile()`: When using the same pattern repeatedly, a compiled object is faster.
- Use the `re.VERBOSE` flag: Add comments and whitespace to complex regex for improved readability.
- Test at regex101.com: Verify match results in real-time and see explanations for each token.