fmhy-bookmarks/make_fmhy_bookmarks.py

"""Generate FMHY bookmark HTML files from FMHY markdown sections."""

from __future__ import annotations

import asyncio
import base64
import logging
import re
from dataclasses import dataclass
from typing import Dict, List, Tuple

import aiohttp

# Configure logging
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class Config:
    """Configuration constants for the FMHY bookmark generator."""

    site_base_url: str = "https://fmhy.net/"
    reddit_base_url: str = "https://www.reddit.com/r/FREEMEDIAHECKYEAH/wiki/"
    base64_rentry_url: str = "https://rentry.co/FMHYBase64/raw"
    github_raw_base: str = (
        "https://raw.githubusercontent.com/fmhy/edit/refs/heads/main/docs/"
    )
    folder_name: str = "FMHY"
    decode_base64: bool = True


@dataclass
class BookmarkLine:
    """Represents one original content line at a leaf."""

    is_starred: bool  # line contains ⭐ or 🌟
    description_raw: str  # raw trailing text after last ")", may be empty
    links: List[Tuple[str, str]]  # list of (title, url) exactly as matched


@dataclass
class WikiSection:
    """Represents a wiki section to be processed."""

    filename: str
    icon: str
    url_key: str


CONFIG = Config()


def parse_heading(line: str, sub_url: str) -> Tuple[str, str]:
    """Parse heading line and return (subcategory, subsubcategory)."""
    if sub_url != "storage":
        if line.startswith("# ►"):
            return line.replace("# ►", "").strip(), "/"
        elif line.startswith("## ▷"):
            return "", line.replace("## ▷", "").strip()
    else:  # storage section uses different heading levels
        if line.startswith("## "):
            return line.replace("## ", "").strip(), "/"
        elif line.startswith("### "):
            return "", line.replace("### ", "").strip()
    return "", ""


def clean_category_name(category: str) -> str:
    """Remove URLs from category names."""
    return "" if "http" in category else category


def add_hierarchy_prefix(
    lines: List[str], section_name: str, sub_url: str
) -> List[str]:
    """Add hierarchy prefix to content lines."""
    modified_lines = []
    curr_subcat = ""
    curr_subsubcat = ""

    for line in lines:
        if line.startswith("#"):  # Heading line
            subcat, subsubcat = parse_heading(line, sub_url)
            if subcat:
                curr_subcat = clean_category_name(subcat)
            if subsubcat:
                curr_subsubcat = clean_category_name(subsubcat)
        elif any(char.isalpha() for char in line):  # Content line
            prefix = f'{{"{section_name.replace(".md", "")}", "{curr_subcat}", "{curr_subsubcat}"}}'
            content = line[2:] if line.startswith("* ") else line
            modified_lines.append(prefix + content)

    return modified_lines


# Base64 processing functions
def fix_base64_padding(encoded_string: str) -> str:
    """Fix base64 padding."""
    missing_padding = len(encoded_string) % 4
    if missing_padding:
        encoded_string += "=" * (4 - missing_padding)
    return encoded_string


def decode_base64_content(input_string: str) -> str:
    """Decode base64 content within backticks."""
    if not CONFIG.decode_base64:
        return input_string

    def base64_decode(match):
        encoded_data = match.group(0)[1:-1]  # Remove backticks
        decoded_bytes = base64.b64decode(fix_base64_padding(encoded_data))
        return decoded_bytes.decode()

    pattern = r"`[^`]+`"
    return re.sub(pattern, base64_decode, input_string)


def process_base64_sections(base64_page: str) -> List[str]:
    """Process base64 page sections."""
    sections = base64_page.split("***")
    formatted_sections = []

    for section in sections:
        # Clean up section formatting
        clean_section = (
            section.strip()
            .replace("#### ", "")
            .replace("\n\n", " - ")
            .replace("\n", ", ")
        )

        # Remove empty lines
        lines = [line for line in clean_section.split("\n") if line.strip()]
        clean_section = "\n".join(lines)

        # Decode base64 if enabled
        clean_section = decode_base64_content(clean_section)

        # Add base64 prefix
        formatted_section = (
            "[🔑Base64](https://rentry.co/FMHYBase64) ► " + clean_section
        )
        formatted_sections.append(formatted_section)

    return formatted_sections


async def download_wiki_content_async(
    session: aiohttp.ClientSession, filename: str
) -> Tuple[str, List[str]]:
    """Download and process wiki content asynchronously."""
    # First try to load locally
    try:
        with open(filename, "r", encoding="utf-8") as f:
            content = f.read()
        logger.info("Loaded %s locally", filename)

        if filename != "base64.md":
            sub_url = filename.replace(".md", "").lower()
            return filename, add_hierarchy_prefix(
                content.split("\n"), filename, sub_url
            )
        else:
            return filename, process_base64_sections(content)
    except FileNotFoundError:
        pass

    # Download remotely if not found locally
    try:
        if filename != "base64.md":
            url = CONFIG.github_raw_base + filename
        else:
            url = CONFIG.base64_rentry_url

        async with session.get(url, timeout=30) as resp:
            resp.raise_for_status()
            content = await resp.text()

            if filename == "base64.md":
                content = content.replace("\r", "")
                logger.info("Downloaded base64 page")
                return filename, process_base64_sections(content)
            else:
                logger.info("Downloaded %s", filename)
                sub_url = filename.replace(".md", "").lower()
                return filename, add_hierarchy_prefix(
                    content.split("\n"), filename, sub_url
                )

    except Exception as e:
        logger.error("Failed to fetch %s (%s). Skipping.", filename, e)
        return filename, []


async def collect_all_wiki_content_async() -> List[str]:
    """Collect and process all wiki sections concurrently."""
    async with aiohttp.ClientSession() as session:
        tasks = []
        for section in WIKI_SECTIONS:
            task = download_wiki_content_async(session, section.filename)
            tasks.append(task)

        logger.info("Starting concurrent fetching of %d sections...", len(tasks))
        results = await asyncio.gather(*tasks, return_exceptions=True)

        all_lines = []
        for result in results:
            if isinstance(result, Exception):
                logger.error("Download task failed: %s", result)
                continue
            filename, lines = result
            all_lines.extend(lines)

        return all_lines


# Wiki sections to process
WIKI_SECTIONS = [
    WikiSection("video.md", "📺", "video"),
    WikiSection("ai.md", "🤖", "ai"),
    WikiSection("mobile.md", "📱", "mobile"),
    WikiSection("audio.md", "🎵", "audio"),
    WikiSection("downloading.md", "💾", "downloading"),
    WikiSection("educational.md", "🧠", "educational"),
    WikiSection("gaming.md", "🎮", "gaming"),
    WikiSection("privacy.md", "📛", "privacy"),
    WikiSection("system-tools.md", "💻", "system-tools"),
    WikiSection("file-tools.md", "🗃️", "file-tools"),
    WikiSection("internet-tools.md", "🔗", "internet-tools"),
    WikiSection("social-media-tools.md", "💬", "social-media-tools"),
    WikiSection("text-tools.md", "📝", "text-tools"),
    WikiSection("video-tools.md", "📼", "video-tools"),
    WikiSection("misc.md", "📂", "misc"),
    WikiSection("reading.md", "📗", "reading"),
    WikiSection("torrenting.md", "🌀", "torrenting"),
    WikiSection("image-tools.md", "📷", "image-tools"),
    WikiSection("gaming-tools.md", "👾", "gaming-tools"),
    WikiSection("linux-macos.md", "🐧🍏", "linux-macos"),
    WikiSection("developer-tools.md", "🖥️", "developer-tools"),
    WikiSection("non-english.md", "🌏", "non-english"),
    WikiSection("storage.md", "🗄️", "storage"),
    WikiSection("base64.md", "🔑", "base64"),
    WikiSection("unsafe.md", "🌶", "unsafe"),
]


async def main_async() -> None:
    """Main execution function (async version)."""
    logger.info("Collecting wiki content...")
    all_content = await collect_all_wiki_content_async()
    full_content = "\n".join(all_content)

    # Generate both bookmark files
    create_html_bookmarks(full_content, "fmhy_in_bookmarks.html")
    create_html_bookmarks(
        full_content, "fmhy_in_bookmarks_starred_only.html", starred_only=True
    )

    logger.info("Bookmark generation complete!")


def parse_bookmark_line(line: str) -> Tuple[str, str, str, BookmarkLine | None]:
    """Parse a line to extract hierarchy and bookmark data."""
    url_pattern = re.compile(r"\[([^\]]+)\]\((https?://[^\)]+)\)")
    hierarchy_pattern = re.compile(r'^\{"([^"]+)", "([^"]+)", "([^"]+)"\}')

    hierarchy_match = hierarchy_pattern.match(line)
    if not hierarchy_match:
        return "", "", "", None

    level1, level2, level3 = hierarchy_match.groups()
    matches = url_pattern.findall(line)

    # Remove non-primary Discord invites, X, Telegram and .onion links
    filters = {"Discord", "X", "Telegram", ".onion"}
    for matched_link in matches.copy():
        if matched_link[0] in filters:
            matches.remove(matched_link)

    # Check if line contains starred content
    is_starred = "⭐" in line or "🌟" in line

    # Extract raw description (text after last URL)
    last_paren = line.rfind(")")
    description_raw = (
        line[last_paren + 1 :].replace("**", "").strip() if last_paren != -1 else ""
    )

    bookmark_line = BookmarkLine(
        is_starred=is_starred, description_raw=description_raw, links=matches
    )

    return level1, level2, level3, bookmark_line


def generate_bookmark_html(
    bookmarks_dict: Dict[str, Dict[str, Dict[str, List[BookmarkLine]]]],
    indent: int = 1,
    starred_only: bool = False,
    path: Tuple[str, ...] = (),
) -> str:
    """Generate HTML from bookmark dictionary."""
    html = ""
    for key, value in bookmarks_dict.items():
        html += "    " * indent + f"<DT><H3>{key}</H3>\n"
        html += "    " * indent + "<DL><p>\n"

        current_path = path + (key,)

        if isinstance(value, dict):
            html += generate_bookmark_html(
                value, indent + 1, starred_only, current_path
            )
        else:
            # At leaf level - render BookmarkLine items
            # current_path should be (level1, level2, level3)
            level1, level2, level3 = (
                current_path if len(current_path) >= 3 else ("", "", "")
            )

            for bookmark_line in value:
                # Skip if starred_only mode and line is not starred
                if starred_only and not bookmark_line.is_starred:
                    continue

                # Compute effective description
                if bookmark_line.description_raw:
                    effective_description = bookmark_line.description_raw
                else:
                    # Fallback description using current hierarchy path
                    effective_description = "- " + (
                        level3 if level3 != "/" else level2 if level2 else level1
                    )

                # Determine which links to render
                links_to_render = bookmark_line.links
                if starred_only:
                    links_to_render = links_to_render[
                        :1
                    ]  # Only first link for starred content

                # Render each link
                for title, url in links_to_render:
                    anchor_text = f"{title} {effective_description}".strip()
                    html += (
                        "    " * (indent + 1)
                        + f'<DT><A HREF="{url}" ADD_DATE="0">{anchor_text}</A>\n'
                    )

        html += "    " * indent + "</DL><p>\n"
    return html


def create_html_bookmarks(
    content: str, output_file: str, starred_only: bool = False
) -> None:
    """Create HTML bookmark file from processed content."""
    bookmarks: Dict[str, Dict[str, Dict[str, List[BookmarkLine]]]] = {}

    for line in content.split("\n"):
        level1, level2, level3, bookmark_line = parse_bookmark_line(line)
        if (
            not level1 or bookmark_line is None
        ):  # Skip lines that don't match hierarchy pattern
            continue

        # Initialize nested structure
        bookmarks.setdefault(level1, {}).setdefault(level2, {}).setdefault(level3, [])
        bookmarks[level1][level2][level3].append(bookmark_line)

    # Generate HTML
    html_content = (
        "<!DOCTYPE NETSCAPE-Bookmark-file-1>\n"
        '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">\n'
        "<TITLE>Bookmarks</TITLE>\n"
        "<H1>Bookmarks</H1>\n"
        "<DL><p>\n"
        f"    <DT><H3>{CONFIG.folder_name}</H3>\n"
        "    <DL><p>\n"
        + generate_bookmark_html(bookmarks, indent=2, starred_only=starred_only)
        + "    </DL><p>\n"
        "</DL><p>\n"
    )

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(html_content)

    logger.info("Created bookmark file: %s", output_file)


def main() -> None:
    """Main execution function."""
    asyncio.run(main_async())


if __name__ == "__main__":
    main()