capcat.core.ethical_scraping
File: Application/capcat/core/ethical_scraping.py
Description
Ethical scraping utilities for Capcat.
Implements best practices:
- Robots.txt caching with 15-minute TTL
- 429/503 error handling with exponential backoff
- Rate limiting enforcement
- Path validation against robots.txt
Constants
_HN_API_USER_AGENT
Value: 'Capcat/2.0 (Personal news archiver; uses official HN API)'
_HN_API_DOMAIN
Value: 'hacker-news.firebaseio.com'
_HN_API_MIN_DELAY
Value: 0.05
Classes
RobotsTxtCache
Cache entry for robots.txt.
EthicalScrapingManager
Manages ethical scraping compliance.
Features:
- Robots.txt caching (15-minute TTL)
- Crawl delay enforcement
- 429/503 exponential backoff
- Path validation
Methods
__init__
def __init__(self, user_agent: str = 'Capcat/2.0')
Initialize ethical scraping manager.
Args: user_agent: User agent string for requests
Parameters:
self, user_agent (str, optional)
configure
def configure(self, crawl_delay: float, robots_cache_ttl_minutes: int) -> None
Update rate-limiting parameters on the singleton after config is loaded.
Parameters:
self, crawl_delay (float), robots_cache_ttl_minutes (int)
Returns: None
get_robots_txt
def get_robots_txt(self, base_url: str, timeout: int = 10) -> Tuple[RobotFileParser, float]
Fetch and parse robots.txt with caching.
Args: base_url: Base URL of the site; timeout: Request timeout in seconds
Returns: Tuple of (RobotFileParser, crawl_delay)
Parameters:
self, base_url (str), timeout (int, optional)
Returns: Tuple[RobotFileParser, float]
_extract_crawl_delay
def _extract_crawl_delay(self, parser: RobotFileParser) -> float
Extract crawl delay from robots.txt parser.
Args: parser: RobotFileParser instance
Returns: Crawl delay in seconds (0.0 if not specified)
Parameters:
self, parser (RobotFileParser)
Returns: float
can_fetch
def can_fetch(self, url: str) -> Tuple[bool, str]
Check if URL can be fetched according to robots.txt.
Args: url: URL to check
Returns: Tuple of (allowed, reason)
Parameters:
self, url (str)
Returns: Tuple[bool, str]
enforce_rate_limit
def enforce_rate_limit(self, domain: str, crawl_delay: float, min_delay: float = 1.0)
Enforce rate limiting with crawl delay - thread-safe via slot reservation.
The lock is held only while reading/updating last_request_time (microseconds). Sleep happens outside the lock so other domains are not blocked. Each thread reserves its firing slot so the next thread queues correctly.
Args: domain: Domain being accessed; crawl_delay: Required crawl delay from robots.txt; min_delay: Minimum delay even if robots.txt doesn’t specify one
Parameters:
self, domain (str), crawl_delay (float), min_delay (float, optional)
request_with_backoff
def request_with_backoff(self, session: requests.Session, url: str, method: str = 'GET', max_retries: int = 3, initial_delay: float = 1.0) -> requests.Response
Make HTTP request with exponential backoff for 429/503 errors.
Args: session: Requests session; url: URL to fetch; method: HTTP method (GET, POST, etc.); max_retries: Maximum number of retries; initial_delay: Initial retry delay in seconds; **kwargs: Additional arguments for requests
Returns: Response object
Raises: requests.RequestException: If all retries fail
Parameters:
self, session (requests.Session), url (str), method (str, optional), max_retries (int, optional), initial_delay (float, optional)
Returns: requests.Response
⚠️ High complexity: 12
request_hn_api
def request_hn_api(self, session: requests.Session, url: str, timeout: int = 10, max_retries: int = 3, initial_delay: float = 1.0, skip_rate_limit: bool = False) -> Optional[dict]
Make a request to the HN Firebase API.
Concurrency-safe. When skip_rate_limit is True, no artificial delay is added (used for concurrent comment fetching where the thread pool size is the throttle). Handles 429/503 with exponential backoff.
Args: session: Requests session; url: Full Firebase API URL; timeout: Request timeout in seconds; max_retries: Maximum retry attempts on 429/503; initial_delay: Initial backoff delay in seconds; skip_rate_limit: If True, skip the inter-request delay
Returns: Parsed JSON dict, or None if the request fails after retries
Parameters:
self, session (requests.Session), url (str), timeout (int, optional), max_retries (int, optional), initial_delay (float, optional), skip_rate_limit (bool, optional)
Returns: Optional[dict]
validate_source_config
def validate_source_config(self, base_url: str, rate_limit: float) -> Tuple[bool, str]
Validate source configuration against robots.txt.
Args: base_url: Base URL of the source; rate_limit: Configured rate limit in seconds
Returns: Tuple of (valid, message)
Parameters:
self, base_url (str), rate_limit (float)
Returns: Tuple[bool, str]
get_cache_stats
def get_cache_stats(self) -> Dict[str, any]
Get statistics about robots.txt cache.
Returns: Dictionary with cache statistics
Parameters:
self
Returns: Dict[str, any]
clear_stale_cache
def clear_stale_cache(self)
Remove stale entries from robots.txt cache.
Parameters:
self
Functions
get_ethical_manager
def get_ethical_manager(user_agent: str = 'Capcat/2.0') -> EthicalScrapingManager
Get or create global ethical scraping manager.
Args: user_agent: User agent string
Returns: EthicalScrapingManager instance
Parameters:
user_agent (str, optional)
Returns: EthicalScrapingManager