Scrapy's default retry middleware doesn't understand proxy failures. It retries with the same proxy IP, which means if that IP is blocked, you're burning bandwidth on guaranteed failures. A proper proxy middleware needs to rotate IPs on each retry, detect soft bans (200 responses with CAPTCHA pages), and track per-IP success rates so you can flag bad subnets.
This guide builds a production-grade proxy middleware for Scrapy that handles all of this.
Basic Proxy Setup in Scrapy
The simplest approach — set the proxy in settings.py:
# settings.py
DOWNLOADER_MIDDLEWARES = {
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
}
# Rotating proxy — each request gets a new IP
# NOTE(review): Scrapy's built-in HttpProxyMiddleware reads the lowercase
# http_proxy/https_proxy *environment variables* (or request.meta['proxy']),
# not custom settings named HTTP_PROXY/HTTPS_PROXY — confirm your provider's
# integration actually consumes these settings, otherwise export them as
# environment variables instead.
HTTP_PROXY = 'http://your-username:[email protected]:8080'
HTTPS_PROXY = 'http://your-username:[email protected]:8080'
This works but gives you zero control over rotation logic, error handling, or geo-targeting. For anything beyond a toy project, you need a custom middleware.
Custom Proxy Rotation Middleware
# middlewares/proxy_middleware.py
import logging
# NOTE(review): uuid appears unused in this module — the spider example is
# what generates session ids; safe to drop here.
import uuid
from scrapy import signals
from scrapy.exceptions import NotConfigured
# Module-level logger, per standard logging convention.
logger = logging.getLogger(__name__)
class RotatingProxyMiddleware:
"""Assigns a rotating residential proxy to each request.
Supports geo-targeting via spider attributes and sticky sessions
for multi-request sequences.
"""
def __init__(self, proxy_host, proxy_port, proxy_user, proxy_pass):
self.proxy_host = proxy_host
self.proxy_port = proxy_port
self.proxy_user = proxy_user
self.proxy_pass = proxy_pass
self.request_count = 0
self.bandwidth_bytes = 0
@classmethod
def from_crawler(cls, crawler):
host = crawler.settings.get('PROXY_HOST', 'gate.proxylabs.app')
port = crawler.settings.get('PROXY_PORT', '8080')
user = crawler.settings.get('PROXY_USER')
password = crawler.settings.get('PROXY_PASS')
if not user or not password:
raise NotConfigured('PROXY_USER and PROXY_PASS must be set')
middleware = cls(host, port, user, password)
crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
return middleware
def process_request(self, request, spider):
username = self.proxy_user
# Geo-targeting: set spider.proxy_country = 'US' etc.
country = getattr(spider, 'proxy_country', None)
if country:
username += f'-country-{country}'
city = getattr(spider, 'proxy_city', None)
if city:
username += f'-city-{city}'
# Sticky sessions: set request.meta['proxy_session'] for multi-page flows
session_id = request.meta.get('proxy_session')
if session_id:
username += f'-session-{session_id}'
proxy_url = f'http://{username}:{self.proxy_pass}@{self.proxy_host}:{self.proxy_port}'
request.meta['proxy'] = proxy_url
self.request_count += 1
def process_response(self, request, response, spider):
self.bandwidth_bytes += len(response.body)
# Detect soft bans — site returns 200 but with CAPTCHA/block page
if self._is_soft_ban(response):
logger.warning(f"Soft ban detected on {request.url} — retrying with new IP")
request.meta.pop('proxy_session', None) # Force new IP
return request # Returning request triggers retry
return response
def process_exception(self, request, exception, spider):
logger.error(f"Proxy error on {request.url}: {exception}")
# Request will be retried by RetryMiddleware with a new IP
# because process_request assigns a new proxy each time
return None
def _is_soft_ban(self, response):
"""Check common soft-ban indicators."""
if response.status in (403, 429, 503):
return True
body_lower = response.text[:2000].lower() if hasattr(response, 'text') else ''
ban_signals = ['captcha', 'blocked', 'access denied', 'rate limit',
'please verify', 'unusual traffic']
return any(signal in body_lower for signal in ban_signals)
def spider_closed(self, spider):
mb = self.bandwidth_bytes / (1024 * 1024)
logger.info(
f"Proxy stats: {self.request_count} requests, "
f"{mb:.1f} MB bandwidth used"
)
Settings Configuration
# settings.py
PROXY_HOST = 'gate.proxylabs.app'
PROXY_PORT = '8080'
PROXY_USER = 'your-username'
PROXY_PASS = 'your-password'
DOWNLOADER_MIDDLEWARES = {
# Disable default proxy middleware
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
# Enable our custom middleware
# (priority 350 < 550: our process_request runs before RetryMiddleware's,
# and our process_response runs after it on the way back up)
'myproject.middlewares.proxy_middleware.RotatingProxyMiddleware': 350,
# Keep retry middleware active — it works with our middleware
'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
}
# Each retry re-enters process_request, which builds a fresh proxy URL
RETRY_TIMES = 3
RETRY_HTTP_CODES = [403, 429, 500, 502, 503]
# Concurrency settings — be reasonable with residential proxies
CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 8
DOWNLOAD_DELAY = 0.5 # 500ms between requests to same domain
DOWNLOAD_TIMEOUT = 30
Geo-Targeted Spider Example
# spiders/price_spider.py
import scrapy
import uuid
class PriceSpider(scrapy.Spider):
    """Scrapes product pages, routing every request through US residential IPs."""

    name = 'price_spider'
    proxy_country = 'US'  # All requests go through US IPs
    start_urls = [
        'https://example.com/product/1',
        'https://example.com/product/2',
    ]

    def parse(self, response):
        # Bind the selector call once; each field is a single CSS extraction.
        select = response.css
        item = {
            'url': response.url,
            'price': select('.price::text').get(),
            'title': select('h1::text').get(),
            'proxy_ip': response.headers.get('X-Via', b'').decode(),
        }
        yield item
class MultiPageSpider(scrapy.Spider):
    """Spider that needs sticky sessions for login + scrape flow."""

    name = 'multi_page'
    proxy_country = 'GB'

    @staticmethod
    def _session_meta(session_id):
        # Carry the id under both keys: 'proxy_session' is what the proxy
        # middleware reads for IP stickiness; 'session_id' is for our own
        # callbacks (the middleware may pop 'proxy_session' on a soft ban).
        return {'proxy_session': session_id, 'session_id': session_id}

    def start_requests(self):
        session_id = uuid.uuid4().hex[:12]
        yield scrapy.Request(
            'https://example.com/login',
            meta=self._session_meta(session_id),
            callback=self.parse_login,
        )

    def parse_login(self, response):
        session_id = response.meta['session_id']
        token = response.css('input[name="csrf"]::attr(value)').get()
        yield scrapy.FormRequest(
            'https://example.com/login',
            formdata={'username': 'user', 'password': 'pass', 'csrf': token},
            meta=self._session_meta(session_id),
            callback=self.after_login,
        )

    def after_login(self, response):
        # Same session_id = same proxy IP throughout the login flow
        session_id = response.meta['session_id']
        for href in response.css('a.product::attr(href)').getall():
            yield scrapy.Request(
                response.urljoin(href),
                meta=self._session_meta(session_id),
                callback=self.parse_product,
            )

    def parse_product(self, response):
        yield {
            'url': response.url,
            'price': response.css('.price::text').get(),
        }
Advanced: Bandwidth-Aware Middleware
If you're tracking costs (ProxyLabs charges per GB, starting at £2.50/GB for 100GB), this variant adds bandwidth tracking and lets you set a budget limit:
# middlewares/budget_proxy_middleware.py
import logging
from scrapy.exceptions import CloseSpider
logger = logging.getLogger(__name__)
class BudgetProxyMiddleware:
"""Extends RotatingProxyMiddleware with bandwidth budget enforcement."""
def __init__(self, proxy_host, proxy_port, proxy_user, proxy_pass, max_mb):
self.proxy_host = proxy_host
self.proxy_port = proxy_port
self.proxy_user = proxy_user
self.proxy_pass = proxy_pass
self.max_bytes = max_mb * 1024 * 1024
self.total_bytes = 0
@classmethod
def from_crawler(cls, crawler):
return cls(
proxy_host=crawler.settings.get('PROXY_HOST', 'gate.proxylabs.app'),
proxy_port=crawler.settings.get('PROXY_PORT', '8080'),
proxy_user=crawler.settings.get('PROXY_USER'),
proxy_pass=crawler.settings.get('PROXY_PASS'),
max_mb=crawler.settings.getfloat('PROXY_BUDGET_MB', 1024), # 1GB default
)
def process_request(self, request, spider):
if self.total_bytes >= self.max_bytes:
raise CloseSpider(f'Bandwidth budget exceeded: {self.total_bytes / 1024 / 1024:.0f} MB')
username = self.proxy_user
country = getattr(spider, 'proxy_country', None)
if country:
username += f'-country-{country}'
proxy_url = f'http://{username}:{self.proxy_pass}@{self.proxy_host}:{self.proxy_port}'
request.meta['proxy'] = proxy_url
def process_response(self, request, response, spider):
self.total_bytes += len(response.body)
remaining_mb = (self.max_bytes - self.total_bytes) / 1024 / 1024
if remaining_mb < 100:
logger.warning(f"Low bandwidth budget: {remaining_mb:.0f} MB remaining")
return response
# settings.py addition
# The spider is force-closed (CloseSpider) once this many MB have been
# downloaded through the proxy.
PROXY_BUDGET_MB = 500 # Stop after 500MB of proxy traffic
Performance Tuning
Scrapy with residential proxies has different performance characteristics than direct scraping:
| Setting | Direct scraping | With residential proxy | Why |
|---|---|---|---|
CONCURRENT_REQUESTS | 32-64 | 8-16 | Residential routes are slower and per-IP load should stay low; the gateway's connection pooling makes high client-side concurrency unnecessary |
DOWNLOAD_DELAY | 0 | 0.25-1.0 | Lower ban rate, better IP reputation |
DOWNLOAD_TIMEOUT | 15 | 30 | Residential IPs have variable latency |
RETRY_TIMES | 2 | 3-5 | Each retry gets a new IP |
AUTOTHROTTLE_ENABLED | Optional | Recommended | Adapts to target site's response time |
Enable autothrottle for production scrapers:
# settings.py
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
# Back off up to 10s between requests when the target site slows down
AUTOTHROTTLE_MAX_DELAY = 10
# Average number of parallel requests Scrapy should aim at per remote site
AUTOTHROTTLE_TARGET_CONCURRENCY = 8.0
Debugging Proxy Issues
When things aren't working, add this to your spider:
class DebugSpider(scrapy.Spider):
    """Fetches httpbin.org/ip and logs which exit IP and proxy were used."""

    name = 'debug'
    start_urls = ['https://httpbin.org/ip']

    def parse(self, response):
        import json
        origin = json.loads(response.text)['origin']
        log = self.logger.info
        log(f"Exit IP: {origin}")
        log(f"Proxy used: {response.request.meta.get('proxy', 'none')}")
        log(f"Response status: {response.status}")
        log(f"Response size: {len(response.body)} bytes")
You can also pipe the IP through ProxyLabs' IP Lookup to verify it's residential and in the correct location.
Combining with scrapy-playwright
For JavaScript-rendered pages, combine the proxy middleware with scrapy-playwright:
# settings.py
DOWNLOAD_HANDLERS = {
"http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
# NOTE(review): the proxy is configured again here for the browser process —
# presumably because Playwright traffic bypasses request.meta['proxy'];
# confirm against the scrapy-playwright docs.
PLAYWRIGHT_LAUNCH_OPTIONS = {
"proxy": {
"server": "http://gate.proxylabs.app:8080",
"username": "your-username",
"password": "your-password",
}
}
# In your spider
# NOTE(review): this yield is a fragment — it belongs inside a spider callback.
yield scrapy.Request(
url='https://example.com/spa-page',
meta={
'playwright': True,
# Hold the callback until the JS-rendered grid is present
'playwright_page_methods': [
{'method': 'wait_for_selector', 'args': ['.product-grid']},
],
},
callback=self.parse_rendered,
)
For full Playwright proxy configuration details, see our Playwright proxy setup tutorial. For broader anti-blocking strategies, check scraping without getting blocked.
Ready to try the fastest residential proxies?
Join developers and businesses who trust ProxyLabs for mission-critical proxy infrastructure.
Building proxy infrastructure since 2019. Previously failed at many things, now failing slightly less.
Related Articles
How to Scrape Amazon Prices in 2026 (Without Getting Blocked)
A working guide to scraping Amazon product prices with residential proxies. Covers their anti-bot stack, request patterns, and code examples in Python.
7 min readResidential Proxies for SEO & SERP Monitoring
How to use residential proxies for accurate SERP tracking, rank monitoring, and SEO audits. Covers geo-targeting, avoiding personalization bias, and code examples.
8 min readContinue exploring
Implementation guides for requests, Scrapy, Axios, Puppeteer, and more.
See how residential proxies fit large-scale scraping workflows.
Evaluate ProxyLabs against Bright Data, Oxylabs, Smartproxy, and others.
Browse location coverage and targeting options across 195+ countries.