capcat.core.html_post_processor
File: Application/capcat/core/html_post_processor.py
Description
HTML Post-Processor for Capcat Archives. Handles post-processing HTML generation after article scraping is complete. Creates directory indices and article pages, and manages the complete web view system.
Constants
_TEMPLATE_MARKER
Value: '<!-- capcat-template-v3 -->'
Classes
HTMLPostProcessor
Post-processing HTML generation system. Handles directory traversal, index generation, and browser launching.
Methods
__init__
def __init__(self)
Parameters:
self
process_directory_tree
def process_directory_tree(self, root_path: str, incremental: bool = True, is_single_article: bool = False) -> str
Process an entire directory tree and generate HTML files.
Args:
- root_path: Root directory path to process
- incremental: If True, only process changed/missing articles
- is_single_article: If True, skip directory index creation (for single command)
Returns: URL of the main index page or article.html for single articles
Parameters:
self, root_path (str), incremental (bool, optional), is_single_article (bool, optional)
Returns: str
_process_article_files
def _process_article_files(self, root_path: Path, incremental: bool = True) -> None
Process article.md and comments.md files with intelligent caching.
Parameters:
self, root_path (Path), incremental (bool, optional)
Returns: None
_should_process_article
def _should_process_article(self, article_dir: Path) -> bool
Determine if article should be processed based on intelligent caching.
Returns True if:
- HTML files don’t exist, OR
- Existing HTML was generated by the old system (missing template marker), OR
- Source files are newer than HTML files
Parameters:
self, article_dir (Path)
Returns: bool
_is_article_directory
def _is_article_directory(self, directory: Path) -> bool
Check if directory contains article content.
Parameters:
self, directory (Path)
Returns: bool
_get_source_config
def _get_source_config(self, article_dir: Path) -> Optional[Dict]
Get source configuration if it has template metadata.
Parameters:
self, article_dir (Path)
Returns: Optional[Dict]
_process_article_directory
def _process_article_directory(self, article_dir: Path, progress = None) -> None
Process a single article directory to generate HTML files.
Parameters:
self, article_dir (Path), progress (optional)
Returns: None
_generate_directory_indices
def _generate_directory_indices(self, root_path: Path) -> None
Generate index.html files for all directories.
Parameters:
self, root_path (Path)
Returns: None
_should_have_index
def _should_have_index(self, directory: Path) -> bool
Determine if directory should have an index.html file.
Parameters:
self, directory (Path)
Returns: bool
_create_directory_index
def _create_directory_index(self, directory: Path) -> None
Create news.html for a specific directory.
Skip news.html for Capcats single article directories since they only contain one article and index.html is sufficient.
Parameters:
self, directory (Path)
Returns: None
_is_capcats_single_article
def _is_capcats_single_article(self, directory: Path) -> bool
Check if directory is a Capcats single article capture.
Returns True if:
- Parent directory is named “Capcats”
- This indicates a single article capture, not a News archive
Examples:
- Capcats/Sam-Altman-Article/ -> True (skip news.html)
- Capcats/InfoQ_26-10-2025/ -> True (skip news.html)
- News_26-10-2025/BBC_26-10-2025/ -> False (keep news.html)
Parameters:
self, directory (Path)
Returns: bool
_create_main_index
def _create_main_index(self, root_path: Path, index_path: Path) -> None
Create the main index.html at the root level.
Parameters:
self, root_path (Path), index_path (Path)
Returns: None
_build_breadcrumb_path
def _build_breadcrumb_path(self, current_path: Path) -> List[str]
Build breadcrumb navigation path for a given directory.
Walks up the directory tree and stops at (and includes) the date folder (News_DD-MM-YYYY). Source folders are included as intermediate levels, not as stopping points.
Parameters:
self, current_path (Path)
Returns: List[str]
_is_archive_root
def _is_archive_root(self, path: Path) -> bool
Check if path is an archive root directory.
Recognises:
- Date folder: News_DD-MM-YYYY (startswith “news_”)
- Source folder: Source-Name_DD-MM-YYYY (canonical format only)
Parameters:
self, path (Path)
Returns: bool
_detect_output_mode
def _detect_output_mode(self, path: Path) -> str
Detect output mode based on directory structure.
Args: path: Path to check (article directory or any path in the tree)
Returns: ‘batch’ for standard news archive structure; ‘custom’ for custom --output directories
Parameters:
self, path (Path)
Returns: str
_get_index_filename
def _get_index_filename(self, output_mode: str) -> str
Get the appropriate index filename based on output mode.
Args: output_mode: Either ‘batch’ or ‘custom’
Returns: ‘news.html’ for batch mode; ‘index.html’ for custom mode or unknown modes
Parameters:
self, output_mode (str)
Returns: str
_extract_title_from_markdown
def _extract_title_from_markdown(self, markdown_path: Path) -> str
Extract the article title from the markdown file’s H1 heading. Falls back to using the folder name if no H1 is found.
Args: markdown_path: Path to the markdown file
Returns: The article title string
Parameters:
self, markdown_path (Path)
Returns: str
_write_html_file
def _write_html_file(self, file_path: Path, content: str) -> None
Write HTML content to file.
Parameters:
self, file_path (Path), content (str)
Returns: None
launch_browser
def launch_browser(self, index_url: str) -> bool
Display URL for browser opening (platform-agnostic approach).
Args: index_url: File URL to the main index
Returns: True if URL was displayed successfully
Parameters:
self, index_url (str)
Returns: bool
Functions
process_html_generation
def process_html_generation(directory_path: str, incremental: bool = True, is_single_article: bool = False) -> Optional[str]
Convenience function to process HTML generation for a directory.
Args:
- directory_path: Path to the directory to process
- incremental: If True, only process recently modified files (default: True)
- is_single_article: If True, skip directory index creation (for single command)
Returns: URL of the generated index page or article.html, or None if failed
Parameters:
directory_path (str), incremental (bool, optional), is_single_article (bool, optional)
Returns: Optional[str]
launch_web_view
def launch_web_view(directory_path: str, incremental: bool = True, is_single_article: bool = False) -> bool
Generate HTML files and display browser URL for a directory.
Args:
- directory_path: Path to the directory to process
- incremental: If True, only process recently modified files (default: True)
- is_single_article: If True, skip directory index creation (for single command)
Returns: True if successful, False otherwise
Parameters:
directory_path (str), incremental (bool, optional), is_single_article (bool, optional)
Returns: bool