What Is Web Scraping?
Web scraping is a technique for automatically extracting desired data from web pages. It is used for news monitoring, price comparison, data analysis, and more. In Python, the most widely used combination is requests for fetching web pages and BeautifulSoup for parsing HTML.
This article covers basic HTML parsing, error handling, pagination, and polite scraping patterns.
Installation
pip install requests beautifulsoup4 lxml
# requests: HTTP request library
# beautifulsoup4: HTML/XML parser
# lxml: High-performance parser engine (optional but recommended)
Basic Usage — Fetching and Parsing a Page
import requests
from bs4 import BeautifulSoup
# 1. Fetch the web page
page_url = "https://example.com"
resp = requests.get(page_url, timeout=10)
resp.raise_for_status()  # Raise exception on HTTP errors

# 2. Parse the HTML into a navigable tree
soup = BeautifulSoup(resp.text, "lxml")

# 3. Extract data
page_title = soup.find("title").get_text()
print(f"Page title: {page_title}")
# Page title: Example Domain

# Extract all links: text plus href (with a fallback when absent)
for anchor in soup.find_all("a"):
    href = anchor.get("href", "N/A")
    text = anchor.get_text(strip=True)
    print(f" {text} -> {href}")
| Parser | Speed | Installation | Features |
|---|---|---|---|
| html.parser | Moderate | Built-in | No additional installation needed |
| lxml | Fast | pip install lxml | C-based, recommended |
| html5lib | Slow | pip install html5lib | Parses like a browser |
Finding Elements with CSS Selectors
select() and select_one() use CSS selector syntax for intuitive element traversal.
from bs4 import BeautifulSoup
# Sample markup: three product cards inside a listing container.
html = """
<div class="product-list">
<div class="product" data-id="1">
<h3 class="name">Python Beginner's Guide</h3>
<span class="price">$25.00</span>
<span class="rating">★ 4.8</span>
</div>
<div class="product" data-id="2">
<h3 class="name">JavaScript: The Definitive Guide</h3>
<span class="price">$32.00</span>
<span class="rating">★ 4.5</span>
</div>
<div class="product" data-id="3">
<h3 class="name">Go Concurrency Programming</h3>
<span class="price">$28.00</span>
<span class="rating">★ 4.9</span>
</div>
</div>
"""

soup = BeautifulSoup(html, "lxml")

# select() returns every node matching the CSS selector;
# select_one() returns only the first match (or None).
for card in soup.select("div.product"):
    name = card.select_one("h3.name").get_text()
    price = card.select_one("span.price").get_text()
    rating = card.select_one("span.rating").get_text()
    data_id = card.get("data-id")
    print(f"[{data_id}] {name} | {price} | {rating}")
# [1] Python Beginner's Guide | $25.00 | ★ 4.8
# [2] JavaScript: The Definitive Guide | $32.00 | ★ 4.5
# [3] Go Concurrency Programming | $28.00 | ★ 4.9
| Selector | Meaning | Example |
|---|---|---|
| tag | Tag name | soup.select("h3") |
| .class | Class | soup.select(".price") |
| #id | ID | soup.select("#header") |
| parent child | Descendant element | soup.select("div.product h3") |
| [attr=val] | Attribute | soup.select("[data-id='1']") |
Practical Pattern — A Safe Scraper Class
Real-world scraping requires error handling, delays between requests, and User-Agent configuration.
import requests
from bs4 import BeautifulSoup
import time
import logging
# Configure root logging once at module load so scraper progress is visible.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module, per stdlib convention.
logger = logging.getLogger(__name__)
class WebScraper:
    """A polite web scraper.

    Wraps a requests.Session with a bot-identifying User-Agent and a
    fixed delay between requests so the target server is not overloaded.
    """

    def __init__(self, delay: float = 1.0):
        self.session = requests.Session()
        self.session.headers.update({
            # Set User-Agent to avoid bot blocking
            "User-Agent": (
                "Mozilla/5.0 (compatible; "
                "MyBot/1.0; +https://example.com/bot)"
            ),
            "Accept-Language": "en-US,en;q=0.9",
        })
        self.delay = delay  # Delay between requests (seconds)

    def fetch_page(self, url: str) -> BeautifulSoup | None:
        """Fetches and parses a page; returns None on any request error.

        Bug fix: the delay now runs in ``finally`` so it also applies to
        failed requests — previously a loop of failing URLs was never
        rate-limited and could hammer the server.
        """
        try:
            response = self.session.get(url, timeout=15)
            response.raise_for_status()
            # Use the detected encoding so non-UTF-8 pages decode correctly.
            response.encoding = response.apparent_encoding
            return BeautifulSoup(response.text, "lxml")
        except requests.RequestException as e:
            logger.error("Request failed [%s]: %s", url, e)
            return None
        finally:
            time.sleep(self.delay)  # Prevent server overload

    def extract_text(self, soup: BeautifulSoup,
                     selector: str) -> str:
        """Safely extracts text via a CSS selector; "" when no match."""
        element = soup.select_one(selector)
        return element.get_text(strip=True) if element else ""
# Usage example: fetch a page with a 1.5 s politeness delay.
scraper = WebScraper(delay=1.5)
page = scraper.fetch_page("https://example.com")
if page:
    logger.info("Page title: %s", scraper.extract_text(page, "title"))
Using requests.Session automatically maintains cookies and headers, which is also useful for scraping that requires a logged-in state.
Handling Pagination
A pattern for collecting data that spans multiple pages.
import requests
from bs4 import BeautifulSoup
import time
def scrape_paginated(base_url: str,
                     max_pages: int = 10) -> list[dict]:
    """Collects data from a paginated site.

    Walks ``?page=1 .. max_pages``, stopping early on a non-200 response
    or an empty page. Robustness fix: items missing a ``.title`` element
    or an ``<a href>`` no longer crash the whole run (the original raised
    AttributeError / KeyError there) — they fall back to "".
    """
    all_items = []
    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"
        print(f"Collecting page {page}: {url}")
        response = requests.get(url, timeout=15)
        if response.status_code != 200:
            print(f"Page {page} request failed: {response.status_code}")
            break
        soup = BeautifulSoup(response.text, "lxml")
        items = soup.select("div.item")
        # Stop if no more items
        if not items:
            print(f"Page {page}: no items — stopping collection")
            break
        for item in items:
            # select_one may return None and <a> may lack href; guard both.
            title_el = item.select_one(".title")
            link_el = item.select_one("a")
            all_items.append({
                "title": title_el.get_text(strip=True) if title_el else "",
                "link": link_el.get("href", "") if link_el else "",
            })
        print(f" -> {len(items)} items collected")
        time.sleep(1.5)  # Prevent server overload
    print(f"Total {len(all_items)} items collected")
    return all_items
Saving Data — CSV and JSON
How to save collected data to files.
import csv
import json
# Save as CSV
def save_to_csv(data: list[dict], filename: str) -> None:
    """Saves a list of dictionaries to a CSV file.

    Does nothing for empty input. Field names come from the first record.
    utf-8-sig writes a BOM so Excel displays non-ASCII text correctly.
    """
    if not data:
        return
    with open(filename, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)
    # Bug fix: the message printed a literal "(unknown)" placeholder
    # instead of the actual file name.
    print(f"CSV saved: {filename} ({len(data)} records)")
# Save as JSON
def save_to_json(data: list[dict], filename: str) -> None:
    """Saves a list of dictionaries to a pretty-printed UTF-8 JSON file."""
    with open(filename, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(data, f, ensure_ascii=False, indent=2)
    # Bug fix: the message printed a literal "(unknown)" placeholder
    # instead of the actual file name.
    print(f"JSON saved: {filename} ({len(data)} records)")
# Usage example
# Demo data and a save in both formats.
products = [
    {"name": "Python Beginner's Guide", "price": 25000},
    {"name": "JavaScript Essentials", "price": 32000},
]
for saver, out_file in ((save_to_csv, "products.csv"),
                        (save_to_json, "products.json")):
    saver(products, out_file)
# CSV saved: products.csv (2 records)
# JSON saved: products.json (2 records)
Using encoding="utf-8-sig" when saving CSV files ensures that special characters display correctly when opened in Excel.
Checking robots.txt
Before scraping, you should check the site’s robots.txt to determine whether scraping is allowed.
from urllib.robotparser import RobotFileParser
def can_scrape(url: str, user_agent: str = "*") -> bool:
    """Checks robots.txt to determine if scraping is allowed."""
    from urllib.parse import urlparse

    parts = urlparse(url)
    robots_url = f"{parts.scheme}://{parts.netloc}/robots.txt"
    parser = RobotFileParser()
    parser.set_url(robots_url)
    try:
        parser.read()
        return parser.can_fetch(user_agent, url)
    except Exception:
        # Assume allowed if robots.txt cannot be read.
        return True
# Usage example
target = "https://example.com/products"
verdict = "Scraping allowed" if can_scrape(target) \
    else "Scraping blocked — prohibited by robots.txt"
print(verdict)
Practical Tips
- Respect robots.txt: Always check robots.txt before scraping
- Delay between requests: Set a minimum delay of 1 second or more to avoid overloading the server
- User-Agent configuration: Set a User-Agent that includes bot information
- Error handling: Prepare for network errors, 404s, and parsing failures
- Use sessions: Reuse cookies and connections with requests.Session
- Encoding handling: Detect the correct encoding with response.apparent_encoding
- Legal considerations: Scraping copyrighted content or collecting personal data may have legal implications
- Dynamic pages: For pages rendered with JavaScript, use selenium or playwright