capcat.core.unified_source_processor
File: Application/capcat/core/unified_source_processor.py
Description
Unified Source Processor for Capcat. This eliminates the 46+ duplicate process_*_articles functions by providing a single, configurable processor that works with all sources.
Follows DRY principle while maintaining source-specific optimizations.
Constants
MANIFEST_FILENAME
Value: '.capcat_fetched.json'
NEW_SOURCE_SYSTEM_AVAILABLE
Value: True
MIRROR_AVAILABLE
Value: True
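NEW_SOURCE_SYSTEM_AVAILABLE (second assignment)
Value: False
MIRROR_AVAILABLE (second assignment)
Value: False
Note: both constants are listed twice because the module assigns each of them in two places; the False values are presumably the fallback used when the corresponding optional import fails (confirm against the module source).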
Classes
FetchResult
UnifiedSourceProcessor
Unified processor for all news sources. Eliminates code duplication while preserving source-specific functionality.
Methods
__init__
def __init__(self, project_root: Optional[Path] = None)
Parameters:
self, project_root (Optional[Path], optional)
_is_source_in_new_system
def _is_source_in_new_system(self, source_name: str) -> bool
Check if source is available in the new source system.
Parameters:
self, source_name (str)
Returns: bool
_assert_all_sources_in_new_system
def _assert_all_sources_in_new_system(self) -> None
Assert that every source in the legacy config is also registered in the new system. Raises ValueError listing any sources that would fall through to the deleted legacy path.
Parameters:
self
Returns: None
process_source_articles
def process_source_articles(self, source_name: str, count: Optional[int], output_dir: str, quiet: bool = False, verbose: bool = False, download_files: bool = False, batch_mode: bool = False, generate_html: bool = False, download_pdfs: bool = False, force_no_pdfs: bool = False) -> None
Universal article processing function. All sources route through the new system.
Args:
  source_name: The source identifier (e.g., 'hn', 'bbc', 'cnn').
  count: Number of articles to fetch.
  quiet: Suppress progress output.
  verbose: Enable verbose logging.
  download_files: Enable media file downloads.
  batch_mode: Whether processing multiple sources (affects retry messages).
  generate_html: Generate HTML version after fetching.
  download_pdfs: Enable PDF downloads (--pdfs flag).
  force_no_pdfs: True when the TUI user explicitly answered 'No' to PDFs.
Parameters:
self, source_name (str), count (Optional[int]), output_dir (str), quiet (bool, optional), verbose (bool, optional), download_files (bool, optional), batch_mode (bool, optional), generate_html (bool, optional), download_pdfs (bool, optional), force_no_pdfs (bool, optional)
Returns: None
_process_with_new_system
def _process_with_new_system(self, source_name: str, count: Optional[int], output_dir: str, quiet: bool = False, verbose: bool = False, download_files: bool = False, batch_mode: bool = False, generate_html: bool = False, download_pdfs: bool = False, force_no_pdfs: bool = False) -> None
Process articles using the new source system.
Parameters:
self, source_name (str), count (Optional[int]), output_dir (str), quiet (bool, optional), verbose (bool, optional), download_files (bool, optional), batch_mode (bool, optional), generate_html (bool, optional), download_pdfs (bool, optional), force_no_pdfs (bool, optional)
Returns: None
⚠️ High complexity: 22
_process_articles_with_new_system
def _process_articles_with_new_system(self, source, articles, base_dir: str, download_files: bool, quiet: bool, verbose: bool, download_pdfs: bool = False, manifest: dict = None)
Process articles using the new source system with parallel execution.
Parameters:
self, source, articles, base_dir (str), download_files (bool), quiet (bool), verbose (bool), download_pdfs (bool, optional), manifest (dict, optional)
⚠️ High complexity: 14
_process_single_article_new_system
def _process_single_article_new_system(self, source, article, base_dir: str, download_files: bool, progress_tracker = None, index: int = 1, download_pdfs: bool = False) -> bool
Process a single article using the new source system.
Parameters:
self, source, article, base_dir (str), download_files (bool), progress_tracker (optional), index (int, optional), download_pdfs (bool, optional)
Returns: bool
⚠️ High complexity: 23
Functions
load_manifest
def load_manifest(base_dir: str) -> dict
Load the URL manifest from a source output directory.
Returns a dict mapping URL -> folder_name. Returns empty dict if the manifest does not exist or is corrupt.
Parameters:
base_dir(str)
Returns: dict
save_manifest
def save_manifest(base_dir: str, manifest: dict) -> None
Write the URL manifest to a source output directory.
Parameters:
base_dir (str), manifest (dict)
Returns: None
filter_already_fetched
def filter_already_fetched(articles: list, manifest: dict) -> tuple
Partition articles into new (not in manifest) and count of skipped.
Args:
  articles: List of Article objects with a .url attribute.
  manifest: Dict mapping URL -> folder_name from a previous run.
Returns: (new_articles, skipped_count) tuple.
Parameters:
articles (list), manifest (dict)
Returns: tuple
_resolve_count
def _resolve_count(cli_count: Optional[int], source_config: 'SourceConfig', config = None) -> int
Resolve article count: CLI flag > capcat.yml sources list > source YAML > global config default.
Args:
  cli_count: Value from the --count flag, or None if not provided.
  source_config: The source's SourceConfig (has article_count field).
  config: FetchNewsConfig instance (used for vault overrides and global fallback).
Returns: Number of articles to fetch.
Parameters:
cli_count (Optional[int]), source_config ('SourceConfig'), config (optional)
Returns: int
_resolve_media
def _resolve_media(download_files: bool, download_pdfs: bool, source_config: 'SourceConfig', config = None, force_no_pdfs: bool = False) -> tuple
Resolve (download_files, download_pdfs) from 4-level hierarchy.
Priority: CLI flags > capcat.yml source entry > source config.yaml > Global-settings.yaml.
Args:
  download_files: True if the --media CLI flag was passed.
  download_pdfs: True if the --pdfs CLI flag was passed.
  source_config: SourceConfig instance for this source.
  config: FetchNewsConfig instance. Defaults to get_config().
  force_no_pdfs: True when the user explicitly answered 'No' to the TUI PDF prompt. Overrides all config-level PDF settings.
Returns: (download_files, download_pdfs) tuple of resolved booleans.
Parameters:
download_files (bool), download_pdfs (bool), source_config ('SourceConfig'), config (optional), force_no_pdfs (bool, optional)
Returns: tuple
⚠️ High complexity: 11
_build_article_metadata
def _build_article_metadata(article, source) -> dict
Build frontmatter metadata dict for an article.
Parameters:
article, source
Returns: dict
_build_comments_metadata
def _build_comments_metadata(article, source) -> dict
Build frontmatter metadata dict for a comments file.
Parameters:
article, source
Returns: dict
get_unified_processor
def get_unified_processor(project_root: Optional[Path] = None) -> UnifiedSourceProcessor
Get global unified processor instance.
Parameters:
project_root (Optional[Path], optional)
Returns: UnifiedSourceProcessor
process_source_articles
def process_source_articles(source_name: str, count: Optional[int], output_dir: str, quiet: bool = False, verbose: bool = False, download_files: bool = False, batch_mode: bool = False, generate_html: bool = False, project_root: Optional[Path] = None, download_pdfs: bool = False, force_no_pdfs: bool = False) -> None
Convenience function to process articles from any source. This replaces all 46+ individual process_*_articles functions.
Args:
  source_name: The source identifier (e.g., 'hn', 'bbc', 'cnn').
  count: Number of articles to fetch.
  quiet: Suppress progress output.
  verbose: Enable verbose logging.
  download_files: Enable media file downloads.
  batch_mode: Whether processing multiple sources (affects retry messages).
  project_root: Optional project root override.
  download_pdfs: Enable PDF downloads (--pdfs flag).
  force_no_pdfs: True when the TUI user explicitly answered 'No' to PDFs.
Parameters:
source_name (str), count (Optional[int]), output_dir (str), quiet (bool, optional), verbose (bool, optional), download_files (bool, optional), batch_mode (bool, optional), generate_html (bool, optional), project_root (Optional[Path], optional), download_pdfs (bool, optional), force_no_pdfs (bool, optional)
Returns: None
progress_callback
def progress_callback(progress: float, stage: str)
Parameters:
progress (float), stage (str)