All articles
scrapy · web scraping · residential proxies

Scrapy Proxy Middleware: Rotating IPs Without Bans

JL
James Liu
Lead Engineer @ ProxyLabs
March 15, 2026
6 min read
Share

Scrapy's default retry middleware doesn't understand proxy failures. It retries with the same proxy IP, which means if that IP is blocked, you're burning bandwidth on guaranteed failures. A proper proxy middleware needs to rotate IPs on each retry, detect soft bans (200 responses with CAPTCHA pages), and track per-IP success rates so you can flag bad subnets.

This guide builds a production-grade proxy middleware for Scrapy that handles all of this.

Basic Proxy Setup in Scrapy

The simplest approach — set the proxy in settings.py:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
}

# Rotating proxy — each request gets a new IP
HTTP_PROXY = 'http://your-username:[email protected]:8080'
HTTPS_PROXY = 'http://your-username:[email protected]:8080'

This works but gives you zero control over rotation logic, error handling, or geo-targeting. For anything beyond a toy project, you need a custom middleware.

Custom Proxy Rotation Middleware

# middlewares/proxy_middleware.py
import logging
import uuid
from scrapy import signals
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)


class RotatingProxyMiddleware:
    """Assigns a rotating residential proxy to each request.
    
    Supports geo-targeting via spider attributes and sticky sessions
    for multi-request sequences.
    """

    def __init__(self, proxy_host, proxy_port, proxy_user, proxy_pass):
        self.proxy_host = proxy_host
        self.proxy_port = proxy_port
        self.proxy_user = proxy_user
        self.proxy_pass = proxy_pass
        self.request_count = 0
        self.bandwidth_bytes = 0

    @classmethod
    def from_crawler(cls, crawler):
        host = crawler.settings.get('PROXY_HOST', 'gate.proxylabs.app')
        port = crawler.settings.get('PROXY_PORT', '8080')
        user = crawler.settings.get('PROXY_USER')
        password = crawler.settings.get('PROXY_PASS')

        if not user or not password:
            raise NotConfigured('PROXY_USER and PROXY_PASS must be set')

        middleware = cls(host, port, user, password)
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        username = self.proxy_user

        # Geo-targeting: set spider.proxy_country = 'US' etc.
        country = getattr(spider, 'proxy_country', None)
        if country:
            username += f'-country-{country}'

        city = getattr(spider, 'proxy_city', None)
        if city:
            username += f'-city-{city}'

        # Sticky sessions: set request.meta['proxy_session'] for multi-page flows
        session_id = request.meta.get('proxy_session')
        if session_id:
            username += f'-session-{session_id}'

        proxy_url = f'http://{username}:{self.proxy_pass}@{self.proxy_host}:{self.proxy_port}'
        request.meta['proxy'] = proxy_url
        self.request_count += 1

    def process_response(self, request, response, spider):
        self.bandwidth_bytes += len(response.body)

        # Detect soft bans — site returns 200 but with CAPTCHA/block page
        if self._is_soft_ban(response):
            logger.warning(f"Soft ban detected on {request.url} — retrying with new IP")
            request.meta.pop('proxy_session', None)  # Force new IP
            return request  # Returning request triggers retry

        return response

    def process_exception(self, request, exception, spider):
        logger.error(f"Proxy error on {request.url}: {exception}")
        # Request will be retried by RetryMiddleware with a new IP
        # because process_request assigns a new proxy each time
        return None

    def _is_soft_ban(self, response):
        """Check common soft-ban indicators."""
        if response.status in (403, 429, 503):
            return True
        body_lower = response.text[:2000].lower() if hasattr(response, 'text') else ''
        ban_signals = ['captcha', 'blocked', 'access denied', 'rate limit',
                       'please verify', 'unusual traffic']
        return any(signal in body_lower for signal in ban_signals)

    def spider_closed(self, spider):
        mb = self.bandwidth_bytes / (1024 * 1024)
        logger.info(
            f"Proxy stats: {self.request_count} requests, "
            f"{mb:.1f} MB bandwidth used"
        )

Settings Configuration

# settings.py
# Gateway credentials — read by RotatingProxyMiddleware.from_crawler;
# the middleware refuses to start (NotConfigured) if USER/PASS are unset.
PROXY_HOST = 'gate.proxylabs.app'
PROXY_PORT = '8080'
PROXY_USER = 'your-username'
PROXY_PASS = 'your-password'

DOWNLOADER_MIDDLEWARES = {
    # Disable default proxy middleware
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
    # Enable our custom middleware
    # (lower number = earlier process_request, later process_response)
    'myproject.middlewares.proxy_middleware.RotatingProxyMiddleware': 350,
    # Keep retry middleware active — it works with our middleware
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
}

RETRY_TIMES = 3
# 403/429 are included because with a rotating proxy a retry means a new IP.
RETRY_HTTP_CODES = [403, 429, 500, 502, 503]

# Concurrency settings — be reasonable with residential proxies
CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 8
DOWNLOAD_DELAY = 0.5  # 500ms between requests to same domain
DOWNLOAD_TIMEOUT = 30  # residential exits can be slow; give them headroom

Geo-Targeted Spider Example

# spiders/price_spider.py
import scrapy
import uuid


class PriceSpider(scrapy.Spider):
    """Scrapes product pages; proxy_country routes every request
    through US exit IPs via the proxy middleware."""

    name = 'price_spider'
    proxy_country = 'US'  # All requests go through US IPs

    start_urls = [
        'https://example.com/product/1',
        'https://example.com/product/2',
    ]

    def parse(self, response):
        # X-Via header — presumably set by the proxy gateway to identify
        # the exit IP (TODO: confirm against gateway docs).
        exit_ip = response.headers.get('X-Via', b'').decode()
        item = {
            'url': response.url,
            'price': response.css('.price::text').get(),
            'title': response.css('h1::text').get(),
            'proxy_ip': exit_ip,
        }
        yield item


class MultiPageSpider(scrapy.Spider):
    """Spider that needs sticky sessions for login + scrape flow."""

    name = 'multi_page'
    proxy_country = 'GB'

    @staticmethod
    def _session_meta(session_id):
        # One proxy_session id pins the whole flow to a single exit IP;
        # session_id is duplicated so callbacks can read it back from meta.
        return {'proxy_session': session_id, 'session_id': session_id}

    def start_requests(self):
        session_id = uuid.uuid4().hex[:12]
        yield scrapy.Request(
            'https://example.com/login',
            meta=self._session_meta(session_id),
            callback=self.parse_login,
        )

    def parse_login(self, response):
        session_id = response.meta['session_id']
        token = response.css('input[name="csrf"]::attr(value)').get()
        yield scrapy.FormRequest(
            'https://example.com/login',
            formdata={'username': 'user', 'password': 'pass', 'csrf': token},
            meta=self._session_meta(session_id),
            callback=self.after_login,
        )

    def after_login(self, response):
        # Same session_id = same proxy IP throughout the login flow
        session_id = response.meta['session_id']
        product_links = response.css('a.product::attr(href)').getall()
        for href in product_links:
            yield scrapy.Request(
                response.urljoin(href),
                meta=self._session_meta(session_id),
                callback=self.parse_product,
            )

    def parse_product(self, response):
        yield {
            'url': response.url,
            'price': response.css('.price::text').get(),
        }

Advanced: Bandwidth-Aware Middleware

If you're tracking costs (ProxyLabs charges per GB, starting at £2.50/GB for 100GB), this variant adds bandwidth tracking and lets you set a budget limit:

# middlewares/budget_proxy_middleware.py
import logging
from scrapy.exceptions import CloseSpider

logger = logging.getLogger(__name__)


class BudgetProxyMiddleware:
    """Extends RotatingProxyMiddleware with bandwidth budget enforcement."""

    def __init__(self, proxy_host, proxy_port, proxy_user, proxy_pass, max_mb):
        self.proxy_host = proxy_host
        self.proxy_port = proxy_port
        self.proxy_user = proxy_user
        self.proxy_pass = proxy_pass
        self.max_bytes = max_mb * 1024 * 1024
        self.total_bytes = 0

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            proxy_host=crawler.settings.get('PROXY_HOST', 'gate.proxylabs.app'),
            proxy_port=crawler.settings.get('PROXY_PORT', '8080'),
            proxy_user=crawler.settings.get('PROXY_USER'),
            proxy_pass=crawler.settings.get('PROXY_PASS'),
            max_mb=crawler.settings.getfloat('PROXY_BUDGET_MB', 1024),  # 1GB default
        )

    def process_request(self, request, spider):
        if self.total_bytes >= self.max_bytes:
            raise CloseSpider(f'Bandwidth budget exceeded: {self.total_bytes / 1024 / 1024:.0f} MB')

        username = self.proxy_user
        country = getattr(spider, 'proxy_country', None)
        if country:
            username += f'-country-{country}'

        proxy_url = f'http://{username}:{self.proxy_pass}@{self.proxy_host}:{self.proxy_port}'
        request.meta['proxy'] = proxy_url

    def process_response(self, request, response, spider):
        self.total_bytes += len(response.body)
        remaining_mb = (self.max_bytes - self.total_bytes) / 1024 / 1024

        if remaining_mb < 100:
            logger.warning(f"Low bandwidth budget: {remaining_mb:.0f} MB remaining")

        return response
# settings.py addition — read by BudgetProxyMiddleware.from_crawler
# (getfloat, so fractional MB values are accepted).
PROXY_BUDGET_MB = 500  # Stop after 500MB of proxy traffic

Performance Tuning

Scrapy with residential proxies has different performance characteristics than direct scraping:

| Setting | Direct scraping | With residential proxy | Why |
| --- | --- | --- | --- |
| CONCURRENT_REQUESTS | 32-64 | 8-16 | Proxy gateway handles connection pooling |
| DOWNLOAD_DELAY | 0 | 0.25-1.0 | Lower ban rate, better IP reputation |
| DOWNLOAD_TIMEOUT | 15 | 30 | Residential IPs have variable latency |
| RETRY_TIMES | 2 | 3-5 | Each retry gets a new IP |
| AUTOTHROTTLE_ENABLED | Optional | Recommended | Adapts to target site's response time |

Enable autothrottle for production scrapers:

# settings.py
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1    # initial per-request delay (seconds)
AUTOTHROTTLE_MAX_DELAY = 10     # ceiling when the target site is slow
AUTOTHROTTLE_TARGET_CONCURRENCY = 8.0  # average parallel requests per remote

Debugging Proxy Issues

When things aren't working, add this to your spider:

class DebugSpider(scrapy.Spider):
    """One-request spider that reports which exit IP the proxy assigned,
    by fetching httpbin's IP echo endpoint."""

    name = 'debug'
    start_urls = ['https://httpbin.org/ip']

    def parse(self, response):
        import json

        payload = json.loads(response.text)
        self.logger.info(f"Exit IP: {payload['origin']}")
        self.logger.info(f"Proxy used: {response.request.meta.get('proxy', 'none')}")
        self.logger.info(f"Response status: {response.status}")
        self.logger.info(f"Response size: {len(response.body)} bytes")

You can also pipe the IP through ProxyLabs' IP Lookup to verify it's residential and in the correct location.

Combining with scrapy-playwright

For JavaScript-rendered pages, combine the proxy middleware with scrapy-playwright:

# settings.py
# scrapy-playwright requires the asyncio-based Twisted reactor; without
# this setting the download handler fails at startup.
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}

# Proxy is configured at browser launch time for Playwright-rendered requests.
PLAYWRIGHT_LAUNCH_OPTIONS = {
    "proxy": {
        "server": "http://gate.proxylabs.app:8080",
        "username": "your-username",
        "password": "your-password",
    }
}
# In your spider — this must be yielded from a callback or start_requests();
# 'playwright': True routes the request through the browser handler.
# NOTE(review): scrapy-playwright documents playwright_page_methods as a list
# of PageMethod objects — confirm this plain-dict form is supported.
yield scrapy.Request(
    url='https://example.com/spa-page',
    meta={
        'playwright': True,
        'playwright_page_methods': [
            {'method': 'wait_for_selector', 'args': ['.product-grid']},
        ],
    },
    callback=self.parse_rendered,
)

For full Playwright proxy configuration details, see our Playwright proxy setup tutorial. For broader anti-blocking strategies, check scraping without getting blocked.

Ready to try the fastest residential proxies?

Join developers and businesses who trust ProxyLabs for mission-critical proxy infrastructure.

~200ms response · Best anti-bot bypass · £2.50/GB
Start Building Now — No subscription required
scrapy · web scraping · residential proxies · python · middleware · proxy rotation
JL
James Liu
Lead Engineer @ ProxyLabs

Building proxy infrastructure since 2019. Previously failed at many things, now failing slightly less.

Found this helpful? Share it with others.

Share