capcat.core.image_processor

File: Application/capcat/core/image_processor.py

Description

Global Image Processor for Capcat. Modular, DRY architecture with source-specific configurations.

Constants

_MIN_PIXEL_DIMENSION

Value: 64

_EXTS

Value: ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg']

Classes

ImageProcessor

Global image processing coordinator. Uses source-specific configurations for clean, modular processing.

Methods

__init__
def __init__(self, session: Optional[requests.Session] = None)

Parameters:

  • self
  • session (Optional[requests.Session]) optional

process_article_images
def process_article_images(self, html_content: str, source_config: dict, base_url: str, output_folder: str, page_title: str = '', media_enabled: bool = False, article_url: str = '', min_pixel_dimension: int = 0, max_image_bytes: int = 0) -> Dict[str, str]

Process images for an article using source-specific configuration. Includes intelligent protection against aggregator sites.

Args:

  • html_content: Raw HTML content
  • source_config: Source configuration with image processing rules
  • base_url: Base URL for resolving relative links
  • output_folder: Article output folder path
  • page_title: Page title for classification
  • media_enabled: Whether the --media flag is enabled (affects limits)

Returns: Dict mapping original URLs to local filenames

Parameters:

  • self
  • html_content (str)
  • source_config (dict)
  • base_url (str)
  • output_folder (str)
  • page_title (str) optional
  • media_enabled (bool) optional
  • article_url (str) optional
  • min_pixel_dimension (int) optional
  • max_image_bytes (int) optional

Returns: Dict[str, str]

_read_image_dimensions
def _read_image_dimensions(filepath: str) -> Optional[Tuple[int, int]]

Return (width, height) for PNG, JPEG, or WebP files by reading header bytes. Returns None if the format is unrecognised or parsing fails.

Parameters:

  • filepath (str)

Returns: Optional[Tuple[int, int]]

⚠️ High complexity: 27

_extract_image_urls
def _extract_image_urls(self, html_content: str, img_config: dict, base_url: str) -> List[str]

Extract image URLs using source-specific selectors.

Parameters:

  • self
  • html_content (str)
  • img_config (dict)
  • base_url (str)

Returns: List[str]

⚠️ High complexity: 18

_should_skip_image
def _should_skip_image(self, img_element, skip_selectors: List[str]) -> bool

Check if image should be skipped based on skip selectors.

Parameters:

  • self
  • img_element
  • skip_selectors (List[str])

Returns: bool

_matches_url_patterns
def _matches_url_patterns(self, url: str, patterns: List[str]) -> bool

Check if URL matches any of the specified patterns.

Parameters:

  • self
  • url (str)
  • patterns (List[str])

Returns: bool

_is_valid_image_url
def _is_valid_image_url(self, url: str, allow_extensionless: bool = False) -> bool

Validate image URL.

Parameters:

  • self
  • url (str)
  • allow_extensionless (bool) optional

Returns: bool

_download_images
def _download_images(self, image_urls: List[str], output_folder: str, max_total_size_mb: int = 20, allow_large_files: bool = False) -> Dict[str, str]

Download images with size limits and return URL to filename mapping.

Parameters:

  • self
  • image_urls (List[str])
  • output_folder (str)
  • max_total_size_mb (int) optional
  • allow_large_files (bool) optional

Returns: Dict[str, str]

_download_images_with_checking
def _download_images_with_checking(self, image_urls: List[str], output_folder: str, media_enabled: bool = False, min_image_size: int = 0, referer: str = '', min_pixel_dimension: int = 0, max_image_bytes: int = 0) -> Dict[str, str]

Download images applying all active filters as a pipeline.

Filter order:

  1. max_image_bytes: HEAD check before download; reject if too large.
  2. min_image_size: byte floor after download; delete if too small.
  3. min_pixel_dimension: pixel check after download; delete if too small.

Filters are independent and all active ones are applied.

Parameters:

  • self
  • image_urls (List[str])
  • output_folder (str)
  • media_enabled (bool) optional
  • min_image_size (int) optional
  • referer (str) optional
  • min_pixel_dimension (int) optional
  • max_image_bytes (int) optional

Returns: Dict[str, str]

_download_single_image_filtered
def _download_single_image_filtered(self, url: str, images_dir: str, counter: int, max_image_bytes: int = 0, min_image_size: int = 0, min_pixel_dimension: int = 0, referer: str = '') -> Optional[str]

Download one image and apply all active filters.

Applies max_image_bytes (pre-download HEAD), min_image_size (post-download byte floor), and min_pixel_dimension (post-download pixel floor) in sequence. Any failing filter removes the file and returns None.

Parameters:

  • self
  • url (str)
  • images_dir (str)
  • counter (int)
  • max_image_bytes (int) optional
  • min_image_size (int) optional
  • min_pixel_dimension (int) optional
  • referer (str) optional

Returns: Optional[str]

⚠️ High complexity: 14

_has_explicit_source_config
def _has_explicit_source_config(self, source_config: Dict) -> bool

Check if source has explicit configuration (not a generic/discovered source).

Parameters:

  • self
  • source_config (Dict)

Returns: bool

_download_single_image_simple
def _download_single_image_simple(self, url: str, images_dir: str, counter: int, referer: str = '') -> Optional[str]

Download single image with simple error handling.

Parameters:

  • self
  • url (str)
  • images_dir (str)
  • counter (int)
  • referer (str) optional

Returns: Optional[str]

_download_single_image_with_min_size
def _download_single_image_with_min_size(self, url: str, images_dir: str, counter: int, min_size: int, referer: str = '') -> Optional[str]

Download single image with minimum size filtering.

Parameters:

  • self
  • url (str)
  • images_dir (str)
  • counter (int)
  • min_size (int)
  • referer (str) optional

Returns: Optional[str]

_download_single_image_with_min_pixels
def _download_single_image_with_min_pixels(self, url: str, images_dir: str, counter: int, min_pixel_dimension: int, referer: str = '') -> Optional[str]

Download image and reject it if both dimensions are below min_pixel_dimension.

Parameters:

  • self
  • url (str)
  • images_dir (str)
  • counter (int)
  • min_pixel_dimension (int)
  • referer (str) optional

Returns: Optional[str]

_download_single_image_with_max_bytes
def _download_single_image_with_max_bytes(self, url: str, images_dir: str, counter: int, max_bytes: int, referer: str = '') -> Optional[str]

Skip download if content-length exceeds max_bytes; download otherwise.

Parameters:

  • self
  • url (str)
  • images_dir (str)
  • counter (int)
  • max_bytes (int)
  • referer (str) optional

Returns: Optional[str]

_download_single_image_with_size_check
def _download_single_image_with_size_check(self, url: str, images_dir: str, counter: int, remaining_bytes: int, allow_large_files: bool = False) -> Tuple[Optional[str], int]

Download single image with size checking and return (filename, size).

Parameters:

  • self
  • url (str)
  • images_dir (str)
  • counter (int)
  • remaining_bytes (int)
  • allow_large_files (bool) optional

Returns: Tuple[Optional[str], int]

_download_single_image
def _download_single_image(self, url: str, images_dir: str, counter: int) -> Optional[str]

Download single image and return filename (legacy method).

Parameters:

  • self
  • url (str)
  • images_dir (str)
  • counter (int)

Returns: Optional[str]

_generate_filename
def _generate_filename(self, url: str, content_type: Optional[str] = None) -> str

Generate clean filename from URL.

Parameters:

  • self
  • url (str)
  • content_type (Optional[str]) optional

Returns: str

_get_extension_from_content_type
def _get_extension_from_content_type(self, content_type: str) -> Optional[str]

Get file extension from content type.

Parameters:

  • self
  • content_type (str)

Returns: Optional[str]

_get_extension_from_url_or_content
def _get_extension_from_url_or_content(self, url: str, content_type: Optional[str] = None) -> str

Get file extension from URL or content type, defaulting to .jpg.

Parameters:

  • self
  • url (str)
  • content_type (Optional[str]) optional

Returns: str

replace_image_urls
def replace_image_urls(markdown_content: str, url_mapping: Dict[str, str]) -> str

Clean URL replacement in markdown content. DRY approach with systematic pattern matching.

Parameters:

  • markdown_content (str)
  • url_mapping (Dict[str, str])

Returns: str

_apply_url_patterns
def _apply_url_patterns(content: str, original_url: str, local_path: str) -> str

Apply systematic URL replacement patterns.

Only exact-match strategies are used. Broad fallback patterns (basename matching, base-URL-without-query) are unsafe: CDN proxy URLs like /.netlify/images or /_next/image share the same path across all images and differ only in their query parameters, so a loose regex would rewrite every image reference to point at the last image processed.

Parameters:

  • content (str)
  • original_url (str)
  • local_path (str)

Returns: str

Functions

get_image_processor

def get_image_processor(session: Optional[requests.Session] = None) -> ImageProcessor

Get ImageProcessor instance.

Parameters:

  • session (Optional[requests.Session]) optional

Returns: ImageProcessor