Complete API documentation for Capcat's hybrid architecture components.
Central registry for source discovery and management.
from core.source_system.source_registry import get_source_registry, SourceRegistry
__init__(sources_dir: str = None)
Initialize the source registry.
sources_dir (str, optional): Path to sources directory. Defaults to sources/active/
registry = SourceRegistry()
# or
registry = SourceRegistry("/custom/sources/path")
discover_sources() -> Dict[str, SourceConfig]
Discover all available sources (config-driven and custom).
Dict[str, SourceConfig]: Mapping of source names to configurations
SourceError: If source discovery fails
sources = registry.discover_sources()
print(f"Discovered {len(sources)} sources")
for name, config in sources.items():
print(f"- {name}: {config.display_name}")
get_source(source_name: str, session=None) -> BaseSource
Get a source instance by name.
source_name (str): Name of the source
session (requests.Session, optional): HTTP session to use
BaseSource: Source instance
SourceError: If source not found or cannot be instantiated
source = registry.get_source('hn')
articles = source.get_articles(count=10)
get_available_sources() -> List[str]
Get list of available source names.
List[str]: List of source names
available = registry.get_available_sources()
print(f"Available sources: {', '.join(available)}")
validate_all_sources(deep_validation: bool = False) -> Dict[str, List[str]]
Validate all registered sources.
deep_validation (bool): Whether to perform network tests
Dict[str, List[str]]: Source names mapped to validation errors
errors = registry.validate_all_sources(deep_validation=True)
for source, error_list in errors.items():
if error_list:
print(f"{source}: {error_list}")
Get the global source registry instance.
SourceRegistry: Global registry instance
registry = get_source_registry()
sources = registry.get_available_sources()
Abstract base class for all source implementations.
from core.source_system.base_source import BaseSource, SourceConfig
__init__(config: SourceConfig, session=None)
Initialize the base source.
config (SourceConfig): Source configuration
session (requests.Session, optional): HTTP session
class CustomSource(BaseSource):
def __init__(self, config: SourceConfig, session=None):
super().__init__(config, session)
get_articles(count: int = 30) -> List[Dict]
count (int): Number of articles to fetch
List[Dict]: Articles with keys: title, url, summary
def get_articles(self, count: int = 30) -> List[Dict]:
response = self.session.get(self.config.base_url)
soup = self._get_soup(response.text)
articles = []
for elem in soup.select('.article-link'):
articles.append({
'title': elem.get_text(strip=True),
'url': self._resolve_url(elem['href']),
'summary': ''
})
return articles[:count]
get_article_content(url: str) -> Optional[str]
Get full content for a specific article.
url (str): Article URL
Optional[str]: Article content as HTML
def get_article_content(self, url: str) -> Optional[str]:
response = self.session.get(url)
soup = self._get_soup(response.text)
content = soup.select_one('.article-content')
return str(content) if content else None
get_comments(url: str) -> List[Dict]
Get comments for an article.
url (str): Article URL
List[Dict]: Comments with keys: author, text, timestamp
def get_comments(self, url: str) -> List[Dict]:
if not self.config.supports_comments:
return []
response = self.session.get(url)
soup = self._get_soup(response.text)
comments = []
for elem in soup.select('.comment'):
comments.append({
'author': elem.select_one('.author').get_text(strip=True),
'text': elem.select_one('.text').get_text(strip=True),
'timestamp': elem.select_one('.timestamp').get('datetime')
})
return comments
validate_config() -> List[str]
Validate source-specific configuration.
List[str]: List of validation errors
def validate_config(self) -> List[str]:
errors = []
if not self.config.base_url:
errors.append("base_url is required")
if not self.config.base_url.startswith('https://'):
errors.append("base_url must use HTTPS")
return errors
_get_soup(html: str) -> BeautifulSoup
Parse HTML content.
html (str): HTML content
BeautifulSoup: Parsed HTML
_resolve_url(url: str) -> str
Resolve relative URLs to absolute URLs.
url (str): Relative or absolute URL
str: Absolute URL
_clean_text(text: str) -> str
Clean and normalize text content.
text (str): Raw text
str: Cleaned text
Configuration data class for sources.
from core.source_system.base_source import SourceConfig
__init__(...)
Initialize source configuration.
name (str): Source name
display_name (str): Human-readable display name
base_url (str): Base URL for the source
timeout (float): Request timeout in seconds
rate_limit (float): Minimum seconds between requests
supports_comments (bool): Whether source supports comments
category (str): Source category
custom_config (Dict): Additional configuration
config = SourceConfig(
name="example",
display_name="Example News",
base_url="https://example.com/",
timeout=10.0,
rate_limit=1.0,
supports_comments=True,
category="tech",
custom_config={"api_key": "secret"}
)
Performance monitoring and metrics collection.
from core.source_system.performance_monitor import PerformanceMonitor, get_performance_monitor
record_request(source_name: str, success: bool, response_time: float)
Record a request for performance tracking.
source_name (str): Name of the source
success (bool): Whether request was successful
response_time (float): Response time in seconds
monitor = get_performance_monitor()
monitor.record_request('hn', True, 2.5)
get_source_metrics(source_name: str) -> SourceMetrics
Get performance metrics for a source.
source_name (str): Name of the source
SourceMetrics: Performance metrics
metrics = monitor.get_source_metrics('hn')
print(f"Success rate: {metrics.success_rate:.1f}%")
print(f"Avg response time: {metrics.avg_response_time:.2f}s")
get_all_metrics() -> Dict[str, SourceMetrics]
Get metrics for all sources.
Dict[str, SourceMetrics]: All source metrics
all_metrics = monitor.get_all_metrics()
for source, metrics in all_metrics.items():
print(f"{source}: {metrics.success_rate:.1f}% success")
generate_performance_report() -> str
Generate human-readable performance report.
str: Performance report
report = monitor.generate_performance_report()
print(report)
Get the global performance monitor instance.
PerformanceMonitor: Global monitor instance
Configuration validation and quality assurance.
from core.source_system.validation_engine import ValidationEngine, ValidationResult
validate_source_config(config: SourceConfig) -> List[ValidationResult]
Validate a single source configuration.
config (SourceConfig): Source configuration
List[ValidationResult]: Validation results
engine = ValidationEngine()
results = engine.validate_source_config(config)
for result in results:
if not result.is_valid:
print(f"Error: {result.message}")
validate_all_sources(configs: Dict[str, SourceConfig], deep_validation: bool = False)
Validate all source configurations.
configs (Dict[str, SourceConfig]): Source configurations
deep_validation (bool): Whether to perform network tests
Dict[str, List[ValidationResult]]: Validation results by source
results = engine.validate_all_sources(configs, deep_validation=True)
generate_validation_report(results: Dict) -> str
Generate comprehensive validation report.
results (Dict): Validation results from validate_all_sources
str: Markdown validation report
report = engine.generate_validation_report(results)
with open('validation_report.md', 'w') as f:
f.write(report)
is_valid (bool): Whether validation passed
message (str): Validation message
severity (str): Severity level ("error", "warning", "info")
category (str): Category ("network", "config", "selectors", "general")
result = ValidationResult(
is_valid=False,
message="Invalid CSS selector",
severity="error",
category="selectors"
)
Global session management for optimal performance.
from core.session_pool import get_global_session, SessionPool
Get the global HTTP session instance.
requests.Session: Optimized session with connection pooling
session = get_global_session()
response = session.get('https://example.com/')
get_session() -> requests.Session
Get a configured session instance.
requests.Session: Configured session
pool = SessionPool()
session = pool.get_session()
Core article content fetching with separated concerns.
from core.article_fetcher import ArticleFetcher
fetch_article_content(title: str, url: str, index: int, base_folder: str, progress_callback=None) -> Tuple[bool, Optional[str], Optional[str]]
Fetch and save article content in markdown format.
title (str): Article title
url (str): Article URL
index (int): Article index number
base_folder (str): Base directory to save to
progress_callback (Optional[Callable]): Progress update callback
Tuple[bool, Optional[str], Optional[str]]: (success, article_folder_path, article_title)
fetcher = ArticleFetcher()
success, folder_path, title = fetcher.fetch_article_content(
title="Example Article",
url="https://example.com/article",
index=1,
base_folder="/path/to/articles"
)
if success:
print(f"Article saved to: {folder_path}")
# Article Title
Original article content...
## Additional Images
*The following images were found using comprehensive page scanning:*


Central interface for unified media processing across all sources.
from core.unified_media_processor import UnifiedMediaProcessor
process_article_media(content: str, html_content: str, url: str, article_folder: str, source_name: str, session: requests.Session) -> str
Process all media embedding for an article using the unified system.
content (str): Markdown content of the article
html_content (str): Original HTML content
url (str): Source URL of the article
article_folder (str): Path to article folder
source_name (str): Name of the news source (for configuration)
session (requests.Session): HTTP session for downloading
str: Updated markdown content with local image references
updated_content = UnifiedMediaProcessor.process_article_media(
content=markdown_content,
html_content=html_content,
url=article_url,
article_folder=article_folder,
source_name="futurism",
session=session
)
add_media_config_to_source(source_config_path: str, source_name: str) -> None
Add media processing configuration to a source's config file.
source_config_path (str): Path to source's config.yaml file
source_name (str): Name of the source
get_integration_code_example(source_name: str) -> str
Get example code for integrating unified media processing into a source.
source_name (str): Name of the source
str: Example Python code for integration
Core engine for media extraction, downloading, and URL replacement.
from core.media_embedding_processor import MediaEmbeddingProcessor
__init__(source_config: Dict[str, Any], session: requests.Session)
Initialize media processor with source-specific configuration.
source_config (Dict[str, Any]): Source-specific media processing configuration
session (requests.Session): HTTP session for downloading
process_media_embedding(content: str, soup: BeautifulSoup, article_folder: str, base_url: str) -> str
Main method to process all media embedding.
content (str): Markdown content
soup (BeautifulSoup): BeautifulSoup object of original HTML
article_folder (str): Path to article folder
base_url (str): Base URL of the source
str: Updated markdown content with local image references
Manages media processing configurations for different news sources.
from core.media_config import MediaConfigManager
get_source_config(source_name: str) -> Dict[str, Any]
Get media processing configuration for a specific source.
source_name (str): Name of the news source
Dict[str, Any]: Dictionary with media processing configuration
config = MediaConfigManager.get_source_config('futurism')
# Returns:
# {
# 'media_processing': {
# 'hero_image_selectors': ['.featured-image img', '.post-thumbnail img'],
# 'url_patterns': {'wordpress': ['/wp-content/uploads/']},
# 'quality_thresholds': {'min_width': 150, 'min_height': 150}
# }
# }
get_all_source_names() -> List[str]
Get list of all configured source names.
List[str]: List of configured source names
Comments are fetched independently from articles using source-specific methods.
# Article and comments are fetched separately
source = registry.get_source('hn')
# 1. Fetch article content
success, folder_path, title = source.fetch_article_content(
title=article.title,
url=article.url,
index=0,
base_folder=output_dir
)
# 2. Fetch comments separately if article was successful
if success and article.comment_url:
source.fetch_comments(
comment_url=article.comment_url,
article_title=title,
article_folder_path=folder_path
)
# Similar pattern for Lobsters
source = registry.get_source('lb')
# Article fetching
success, folder_path, title = source.fetch_article_content(...)
# Independent comment fetching
if success and comment_url:
source.fetch_comments(comment_url, title, folder_path)
try:
# Article fetching
success, folder, title = fetcher.fetch_article_content(...)
# Comment fetching (independent)
if success and comment_url:
try:
source.fetch_comments(comment_url, title, folder)
except Exception as e:
logger.debug(f"Comments failed: {e}")
# Article is still saved successfully
except Exception as e:
logger.error(f"Article fetch failed: {e}")
Network-related configuration.
from core.config import NetworkConfig
config = NetworkConfig(
connect_timeout=10,
read_timeout=8,
user_agent="Custom User Agent"
)
Processing-related configuration.
from core.config import ProcessingConfig
config = ProcessingConfig(
max_workers=8,
download_images=True,
download_videos=False
)
Performance metrics data class.
from core.source_system.performance_monitor import SourceMetrics
@dataclass
class SourceMetrics:
    """Performance counters tracked for a single source.

    Only ``source_name`` and ``source_type`` are required; every counter
    starts at zero and ``last_error`` starts as ``None``.
    """

    source_name: str
    source_type: str
    total_requests: int = 0
    successful_requests: int = 0
    avg_response_time: float = 0.0
    articles_discovered: int = 0
    articles_fetched: int = 0
    articles_failed: int = 0
    last_error: Optional[str] = None

    @property
    def success_rate(self) -> float:
        """Return successful requests as a percentage of total requests.

        Returns:
            ``successful_requests / total_requests * 100``, or ``0.0`` when
            no requests have been recorded yet.
        """
        # total_requests defaults to 0, so a freshly created instance would
        # otherwise raise ZeroDivisionError here.
        if not self.total_requests:
            return 0.0
        return (self.successful_requests / self.total_requests) * 100
Base exception for source-related errors.
from core.source_system.base_source import SourceError
try:
source = registry.get_source('nonexistent')
except SourceError as e:
print(f"Source error: {e}")
Configuration-related errors.
from core.source_system.base_source import ConfigurationError
try:
config = load_invalid_config()
except ConfigurationError as e:
print(f"Configuration error: {e}")
from typing import List, Dict, Optional
from core.source_system.base_source import BaseSource, SourceConfig
class ExampleSource(BaseSource):
    """Example custom source built on ``BaseSource``.

    Scrapes article links from the configured base URL and extracts article
    bodies with fixed CSS selectors.
    """

    def get_articles(self, count: int = 30) -> List[Dict]:
        """Return at most ``count`` articles found on the base URL page.

        Each article is a dict with ``title``, ``url`` and an empty
        ``summary``. ``raise_for_status()`` is called on the listing
        response before it is parsed.
        """
        listing = self.session.get(self.config.base_url)
        listing.raise_for_status()
        page = self._get_soup(listing.text)
        return [
            {
                'title': self._clean_text(anchor.get_text()),
                'url': self._resolve_url(anchor['href']),
                'summary': '',
            }
            for anchor in page.select('.article-link')[:count]
        ]

    def get_article_content(self, url: str) -> Optional[str]:
        """Return the ``.article-content`` element of ``url`` as a string.

        Returns ``None`` when the page has no matching element.
        """
        page = self._get_soup(self.session.get(url).text)
        body = page.select_one('.article-content')
        if not body:
            return None
        return str(body)

    def validate_config(self) -> List[str]:
        """Return a list of configuration errors (empty when valid)."""
        if self.config.base_url:
            return []
        return ["base_url is required"]
# Get registry and discover sources
registry = get_source_registry()
available = registry.get_available_sources()
# Create source instance
source = registry.get_source('hn')
# Fetch articles
articles = source.get_articles(count=10)
# Get performance metrics
monitor = get_performance_monitor()
metrics = monitor.get_source_metrics('hn')
This API reference covers all public interfaces in the Capcat hybrid architecture. For implementation examples, see the Source Development Guide.