capcat.core.image_processor
File: Application/capcat/core/image_processor.py
Description
Global Image Processor for Capcat. Modular, DRY architecture with source-specific configurations.
Constants
_MIN_PIXEL_DIMENSION
Value: 64
_EXTS
Value: ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg']
Classes
ImageProcessor
Global image processing coordinator. Uses source-specific configurations for clean, modular processing.
Methods
__init__
def __init__(self, session: Optional[requests.Session] = None)
Parameters:
self, session (Optional[requests.Session], optional)
process_article_images
def process_article_images(self, html_content: str, source_config: dict, base_url: str, output_folder: str, page_title: str = '', media_enabled: bool = False, article_url: str = '', min_pixel_dimension: int = 0, max_image_bytes: int = 0) -> Dict[str, str]
Process images for an article using source-specific configuration. Includes intelligent protection against aggregator sites.
Args:
- html_content: Raw HTML content
- source_config: Source configuration with image processing rules
- base_url: Base URL for resolving relative links
- output_folder: Article output folder path
- page_title: Page title for classification
- media_enabled: Whether the --media flag is enabled (affects limits)
Returns: Dict mapping original URLs to local filenames
Parameters:
self, html_content (str), source_config (dict), base_url (str), output_folder (str), page_title (str, optional), media_enabled (bool, optional), article_url (str, optional), min_pixel_dimension (int, optional), max_image_bytes (int, optional)
Returns: Dict[str, str]
_read_image_dimensions
def _read_image_dimensions(filepath: str) -> Optional[Tuple[int, int]]
Return (width, height) for PNG, JPEG, or WebP files by reading header bytes. Returns None if the format is unrecognised or parsing fails.
Parameters:
filepath(str)
Returns: Optional[Tuple[int, int]]
⚠️ High complexity: 27
_extract_image_urls
def _extract_image_urls(self, html_content: str, img_config: dict, base_url: str) -> List[str]
Extract image URLs using source-specific selectors.
Parameters:
self, html_content (str), img_config (dict), base_url (str)
Returns: List[str]
⚠️ High complexity: 18
_should_skip_image
def _should_skip_image(self, img_element, skip_selectors: List[str]) -> bool
Check if image should be skipped based on skip selectors.
Parameters:
self, img_element, skip_selectors (List[str])
Returns: bool
_matches_url_patterns
def _matches_url_patterns(self, url: str, patterns: List[str]) -> bool
Check if URL matches any of the specified patterns.
Parameters:
self, url (str), patterns (List[str])
Returns: bool
_is_valid_image_url
def _is_valid_image_url(self, url: str, allow_extensionless: bool = False) -> bool
Validate image URL.
Parameters:
self, url (str), allow_extensionless (bool, optional)
Returns: bool
_download_images
def _download_images(self, image_urls: List[str], output_folder: str, max_total_size_mb: int = 20, allow_large_files: bool = False) -> Dict[str, str]
Download images with size limits and return URL to filename mapping.
Parameters:
self, image_urls (List[str]), output_folder (str), max_total_size_mb (int, optional), allow_large_files (bool, optional)
Returns: Dict[str, str]
_download_images_with_checking
def _download_images_with_checking(self, image_urls: List[str], output_folder: str, media_enabled: bool = False, min_image_size: int = 0, referer: str = '', min_pixel_dimension: int = 0, max_image_bytes: int = 0) -> Dict[str, str]
Download images applying all active filters as a pipeline.
Filter order:
- max_image_bytes: HEAD check before download; reject if too large.
- min_image_size: byte floor after download; delete if too small.
- min_pixel_dimension: pixel check after download; delete if too small.
Filters are independent and all active ones are applied.
Parameters:
self, image_urls (List[str]), output_folder (str), media_enabled (bool, optional), min_image_size (int, optional), referer (str, optional), min_pixel_dimension (int, optional), max_image_bytes (int, optional)
Returns: Dict[str, str]
_download_single_image_filtered
def _download_single_image_filtered(self, url: str, images_dir: str, counter: int, max_image_bytes: int = 0, min_image_size: int = 0, min_pixel_dimension: int = 0, referer: str = '') -> Optional[str]
Download one image and apply all active filters.
Applies max_image_bytes (pre-download HEAD), min_image_size (post-download byte floor), and min_pixel_dimension (post-download pixel floor) in sequence. Any failing filter removes the file and returns None.
Parameters:
self, url (str), images_dir (str), counter (int), max_image_bytes (int, optional), min_image_size (int, optional), min_pixel_dimension (int, optional), referer (str, optional)
Returns: Optional[str]
⚠️ High complexity: 14
_has_explicit_source_config
def _has_explicit_source_config(self, source_config: Dict) -> bool
Check if source has explicit configuration (not a generic/discovered source).
Parameters:
self, source_config (Dict)
Returns: bool
_download_single_image_simple
def _download_single_image_simple(self, url: str, images_dir: str, counter: int, referer: str = '') -> Optional[str]
Download single image with simple error handling.
Parameters:
self, url (str), images_dir (str), counter (int), referer (str, optional)
Returns: Optional[str]
_download_single_image_with_min_size
def _download_single_image_with_min_size(self, url: str, images_dir: str, counter: int, min_size: int, referer: str = '') -> Optional[str]
Download single image with minimum size filtering.
Parameters:
self, url (str), images_dir (str), counter (int), min_size (int), referer (str, optional)
Returns: Optional[str]
_download_single_image_with_min_pixels
def _download_single_image_with_min_pixels(self, url: str, images_dir: str, counter: int, min_pixel_dimension: int, referer: str = '') -> Optional[str]
Download image and reject it if both dimensions are below min_pixel_dimension.
Parameters:
self, url (str), images_dir (str), counter (int), min_pixel_dimension (int), referer (str, optional)
Returns: Optional[str]
_download_single_image_with_max_bytes
def _download_single_image_with_max_bytes(self, url: str, images_dir: str, counter: int, max_bytes: int, referer: str = '') -> Optional[str]
Skip download if content-length exceeds max_bytes; download otherwise.
Parameters:
self, url (str), images_dir (str), counter (int), max_bytes (int), referer (str, optional)
Returns: Optional[str]
_download_single_image_with_size_check
def _download_single_image_with_size_check(self, url: str, images_dir: str, counter: int, remaining_bytes: int, allow_large_files: bool = False) -> Tuple[Optional[str], int]
Download single image with size checking and return (filename, size).
Parameters:
self, url (str), images_dir (str), counter (int), remaining_bytes (int), allow_large_files (bool, optional)
Returns: Tuple[Optional[str], int]
_download_single_image
def _download_single_image(self, url: str, images_dir: str, counter: int) -> Optional[str]
Download single image and return filename (legacy method).
Parameters:
self, url (str), images_dir (str), counter (int)
Returns: Optional[str]
_generate_filename
def _generate_filename(self, url: str, content_type: Optional[str] = None) -> str
Generate clean filename from URL.
Parameters:
self, url (str), content_type (Optional[str], optional)
Returns: str
_get_extension_from_content_type
def _get_extension_from_content_type(self, content_type: str) -> Optional[str]
Get file extension from content type.
Parameters:
self, content_type (str)
Returns: Optional[str]
_get_extension_from_url_or_content
def _get_extension_from_url_or_content(self, url: str, content_type: Optional[str] = None) -> str
Get file extension from URL or content type, defaulting to .jpg.
Parameters:
self, url (str), content_type (Optional[str], optional)
Returns: str
replace_image_urls
def replace_image_urls(markdown_content: str, url_mapping: Dict[str, str]) -> str
Clean URL replacement in markdown content. DRY approach with systematic pattern matching.
Parameters:
markdown_content (str), url_mapping (Dict[str, str])
Returns: str
_apply_url_patterns
def _apply_url_patterns(content: str, original_url: str, local_path: str) -> str
Apply systematic URL replacement patterns.
Only uses exact-match strategies. Broad fallback patterns (basename matching, base-URL-without-query) are unsafe: CDN proxy URLs like /.netlify/images or /_next/image share the same path across all images and differ only in query parameters, so a loose regex overwrites every image reference to the last one processed.
Parameters:
content (str), original_url (str), local_path (str)
Returns: str
Functions
get_image_processor
def get_image_processor(session: Optional[requests.Session] = None) -> ImageProcessor
Get ImageProcessor instance.
Parameters:
session (Optional[requests.Session], optional)
Returns: ImageProcessor