capcat.core.image_processor

File: Application/capcat/core/image_processor.py

Description

Global Image Processor for Capcat. Modular, DRY architecture with source-specific configurations.

Constants

_MIN_PIXEL_DIMENSION

Value: 64

_EXTS

Value: ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg']

Classes

ImageProcessor

Global image processing coordinator. Uses source-specific configurations for clean, modular processing.

Methods

__init__

def __init__(self, session: Optional[requests.Session] = None)

Parameters:

self
session (Optional[requests.Session]) optional

process_article_images

def process_article_images(self, html_content: str, source_config: dict, base_url: str, output_folder: str, page_title: str = '', media_enabled: bool = False, article_url: str = '', min_pixel_dimension: int = 0, max_image_bytes: int = 0) -> Dict[str, str]

Process images for an article using source-specific configuration. Includes intelligent protection against aggregator sites.

Args:

html_content: Raw HTML content
source_config: Source configuration with image processing rules
base_url: Base URL for resolving relative links
output_folder: Article output folder path
page_title: Page title for classification
media_enabled: Whether the --media flag is enabled (affects limits)

Returns: Dict mapping original URLs to local filenames

Parameters:

self
html_content (str)
source_config (dict)
base_url (str)
output_folder (str)
page_title (str) optional
media_enabled (bool) optional
article_url (str) optional
min_pixel_dimension (int) optional
max_image_bytes (int) optional

Returns: Dict[str, str]

_read_image_dimensions

def _read_image_dimensions(filepath: str) -> Optional[Tuple[int, int]]

Return (width, height) for PNG, JPEG, or WebP files by reading header bytes. Returns None if the format is unrecognised or parsing fails.

Parameters:

filepath (str)

Returns: Optional[Tuple[int, int]]

⚠️ High complexity: 27

_extract_image_urls

def _extract_image_urls(self, html_content: str, img_config: dict, base_url: str) -> List[str]

Extract image URLs using source-specific selectors.

Parameters:

self
html_content (str)
img_config (dict)
base_url (str)

Returns: List[str]

⚠️ High complexity: 18

_should_skip_image

def _should_skip_image(self, img_element, skip_selectors: List[str]) -> bool

Check if image should be skipped based on skip selectors.

Parameters:

self
img_element
skip_selectors (List[str])

Returns: bool

_matches_url_patterns

def _matches_url_patterns(self, url: str, patterns: List[str]) -> bool

Check if URL matches any of the specified patterns.

Parameters:

self
url (str)
patterns (List[str])

Returns: bool

_is_valid_image_url

def _is_valid_image_url(self, url: str, allow_extensionless: bool = False) -> bool

Validate image URL.

Parameters:

self
url (str)
allow_extensionless (bool) optional

Returns: bool

_download_images

def _download_images(self, image_urls: List[str], output_folder: str, max_total_size_mb: int = 20, allow_large_files: bool = False) -> Dict[str, str]

Download images with size limits and return URL to filename mapping.

Parameters:

self
image_urls (List[str])
output_folder (str)
max_total_size_mb (int) optional
allow_large_files (bool) optional

Returns: Dict[str, str]

_download_images_with_checking

def _download_images_with_checking(self, image_urls: List[str], output_folder: str, media_enabled: bool = False, min_image_size: int = 0, referer: str = '', min_pixel_dimension: int = 0, max_image_bytes: int = 0) -> Dict[str, str]

Download images applying all active filters as a pipeline.

Filter order:

1. max_image_bytes: HEAD check before download; reject if too large.
2. min_image_size: byte floor after download; delete if too small.
3. min_pixel_dimension: pixel check after download; delete if too small.

Filters are independent and all active ones are applied.

Parameters:

self
image_urls (List[str])
output_folder (str)
media_enabled (bool) optional
min_image_size (int) optional
referer (str) optional
min_pixel_dimension (int) optional
max_image_bytes (int) optional

Returns: Dict[str, str]

_download_single_image_filtered

def _download_single_image_filtered(self, url: str, images_dir: str, counter: int, max_image_bytes: int = 0, min_image_size: int = 0, min_pixel_dimension: int = 0, referer: str = '') -> Optional[str]

Download one image and apply all active filters.

Applies max_image_bytes (pre-download HEAD), min_image_size (post-download byte floor), and min_pixel_dimension (post-download pixel floor) in sequence. Any failing filter removes the file and returns None.

Parameters:

self
url (str)
images_dir (str)
counter (int)
max_image_bytes (int) optional
min_image_size (int) optional
min_pixel_dimension (int) optional
referer (str) optional

Returns: Optional[str]

⚠️ High complexity: 14

_has_explicit_source_config

def _has_explicit_source_config(self, source_config: Dict) -> bool

Check if source has explicit configuration (not a generic/discovered source).

Parameters:

self
source_config (Dict)

Returns: bool

_download_single_image_simple

def _download_single_image_simple(self, url: str, images_dir: str, counter: int, referer: str = '') -> Optional[str]

Download single image with simple error handling.

Parameters:

self
url (str)
images_dir (str)
counter (int)
referer (str) optional

Returns: Optional[str]

_download_single_image_with_min_size

def _download_single_image_with_min_size(self, url: str, images_dir: str, counter: int, min_size: int, referer: str = '') -> Optional[str]

Download single image with minimum size filtering.

Parameters:

self
url (str)
images_dir (str)
counter (int)
min_size (int)
referer (str) optional

Returns: Optional[str]

_download_single_image_with_min_pixels

def _download_single_image_with_min_pixels(self, url: str, images_dir: str, counter: int, min_pixel_dimension: int, referer: str = '') -> Optional[str]

Download image and reject it if both dimensions are below min_pixel_dimension.

Parameters:

self
url (str)
images_dir (str)
counter (int)
min_pixel_dimension (int)
referer (str) optional

Returns: Optional[str]

_download_single_image_with_max_bytes

def _download_single_image_with_max_bytes(self, url: str, images_dir: str, counter: int, max_bytes: int, referer: str = '') -> Optional[str]

Skip download if content-length exceeds max_bytes; download otherwise.

Parameters:

self
url (str)
images_dir (str)
counter (int)
max_bytes (int)
referer (str) optional

Returns: Optional[str]

_download_single_image_with_size_check

def _download_single_image_with_size_check(self, url: str, images_dir: str, counter: int, remaining_bytes: int, allow_large_files: bool = False) -> Tuple[Optional[str], int]

Download single image with size checking and return (filename, size).

Parameters:

self
url (str)
images_dir (str)
counter (int)
remaining_bytes (int)
allow_large_files (bool) optional

Returns: Tuple[Optional[str], int]

_download_single_image

def _download_single_image(self, url: str, images_dir: str, counter: int) -> Optional[str]

Download single image and return filename (legacy method).

Parameters:

self
url (str)
images_dir (str)
counter (int)

Returns: Optional[str]

_generate_filename

def _generate_filename(self, url: str, content_type: Optional[str] = None) -> str

Generate clean filename from URL.

Parameters:

self
url (str)
content_type (Optional[str]) optional

Returns: str

_get_extension_from_content_type

def _get_extension_from_content_type(self, content_type: str) -> Optional[str]

Get file extension from content type.

Parameters:

self
content_type (str)

Returns: Optional[str]

_get_extension_from_url_or_content

def _get_extension_from_url_or_content(self, url: str, content_type: Optional[str] = None) -> str

Get file extension from URL or content type, defaulting to .jpg.

Parameters:

self
url (str)
content_type (Optional[str]) optional

Returns: str

replace_image_urls

def replace_image_urls(markdown_content: str, url_mapping: Dict[str, str]) -> str

Clean URL replacement in markdown content. DRY approach with systematic pattern matching.

Parameters:

markdown_content (str)
url_mapping (Dict[str, str])

Returns: str

_apply_url_patterns

def _apply_url_patterns(content: str, original_url: str, local_path: str) -> str

Apply systematic URL replacement patterns.

Only uses exact-match strategies. Broad fallback patterns (basename matching, base-URL-without-query) are unsafe: CDN proxy URLs like /.netlify/images or /_next/image share the same path across all images and differ only in query parameters, so a loose regex overwrites every image reference to the last one processed.

Parameters:

content (str)
original_url (str)
local_path (str)

Returns: str

Functions

get_image_processor

def get_image_processor(session: Optional[requests.Session] = None) -> ImageProcessor

Get ImageProcessor instance.

Parameters:

session (Optional[requests.Session]) optional

Returns: ImageProcessor