Refactor bookmark generator and use asynchronous events (#11)

2026-03-11 08:55:39 +00:00 · 2025-09-01 19:18:47 +08:00 · 2025-09-01 19:18:47 +08:00 · 5defd48c97
commit 5defd48c97
parent 78e65e8a01
2 changed files with 346 additions and 222 deletions
--- a/make_fmhy_bookmarks.py
+++ b/make_fmhy_bookmarks.py
@ -1,269 +1,393 @@
-import requests
+"""Generate FMHY bookmark HTML files from FMHY markdown sections."""
-def addPretext(lines, sectionName, baseURL, subURL):
+from __future__ import annotations
 import asyncio
 import base64
 import logging
 import re
 from dataclasses import dataclass
 from typing import Dict, List, Tuple
 import aiohttp
 # Configure logging
 logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
 logger = logging.getLogger(__name__)
@dataclass(frozen=True)
 class Config:
    """Configuration constants for the FMHY bookmark generator.

    The dataclass is frozen, so fields cannot be reassigned on an instance
    after construction.
    """
    # Public site root; not read by any code visible in this view.
    site_base_url: str = "https://fmhy.net/"
    # Reddit wiki root; not read by any code visible in this view.
    reddit_base_url: str = "https://www.reddit.com/r/FREEMEDIAHECKYEAH/wiki/"
    # Raw-text URL fetched for "base64.md" when no local copy is found.
    base64_rentry_url: str = "https://rentry.co/FMHYBase64/raw"
    # Prefix that a section filename is appended to when downloading it.
    github_raw_base: str = (
        "https://raw.githubusercontent.com/fmhy/edit/refs/heads/main/docs/"
    )
    # Name of the top-level folder written into the bookmark file.
    folder_name: str = "FMHY"
    # When False, decode_base64_content returns its input untouched.
    decode_base64: bool = True
@dataclass
 class BookmarkLine:
    """Parsed data for one wiki content line, stored at a leaf of the bookmark tree.

    Instances are built by parse_bookmark_line and rendered by
    generate_bookmark_html.
    """
    is_starred: bool  # True when the line contains ⭐ or 🌟
    description_raw: str  # text after the last ")", with "**" removed and stripped; may be empty
    links: List[Tuple[str, str]]  # (title, url) pairs in match order, exactly as the URL regex captured them
@dataclass
 class WikiSection:
    """Describes one wiki section (one markdown file) to be processed."""
    # Markdown filename; tried as a local path first, then appended to the
    # GitHub raw base URL ("base64.md" is fetched from its own URL instead).
    filename: str
    # Emoji associated with the section; not read by any code visible here.
    icon: str
    # Short key for the section; not read by any code visible here.
    url_key: str
 CONFIG = Config()
 def parse_heading(line: str, sub_url: str) -> Tuple[str, str]:
    """Split a markdown heading into (subcategory, subsubcategory).

    A top-level heading yields ``(name, "/")``, a nested heading yields
    ``("", name)``, and any other line yields ``("", "")``.  The "storage"
    section marks its headings with plain ``## `` / ``### `` instead of the
    ``# ►`` / ``## ▷`` markers used everywhere else.
    """
    if sub_url == "storage":
        top_marker, nested_marker = "## ", "### "
    else:
        top_marker, nested_marker = "# ►", "## ▷"

    if line.startswith(top_marker):
        # "/" is the placeholder for "no sub-subcategory yet".
        return line.replace(top_marker, "").strip(), "/"
    if line.startswith(nested_marker):
        return "", line.replace(nested_marker, "").strip()
    return "", ""
 def clean_category_name(category: str) -> str:
    """Return *category*, or an empty string when it contains "http"."""
    if "http" in category:
        # Headings that embed a link are discarded entirely.
        return ""
    return category
 def add_hierarchy_prefix(
    lines: List[str], section_name: str, sub_url: str
 ) -> List[str]:
    """Add hierarchy prefix to content lines."""
    modified_lines = []
-    currMdSubheading = ""
+    curr_subcat = ""
-    currSubCat = ""
+    curr_subsubcat = ""
    currSubSubCat = ""
    #Remove from the lines any line that isnt a heading and doesnt contain the character `⭐`
    #lines = [line for line in lines if line.startswith("#") or '⭐' in line]
    #Parse headings
    for line in lines:
-        if line.startswith("#"): #Title Lines
+        if line.startswith("#"):  # Heading line
-            if not subURL=="storage":
+            subcat, subsubcat = parse_heading(line, sub_url)
-                if line.startswith("# ►"):
+            if subcat:
-                    currMdSubheading = "#" + line.replace("# ►", "").strip().replace(" / ", "-").replace(" ", "-").lower()
+                curr_subcat = clean_category_name(subcat)
-                    currSubCat = line.replace("# ►", "").strip()
+            if subsubcat:
-                    currSubSubCat = "/"
+                curr_subsubcat = clean_category_name(subsubcat)
-                elif line.startswith("## ▷"):
+        elif any(char.isalpha() for char in line):  # Content line
-                    if not subURL=="non-english": #Because non-eng section has multiple subsubcats with same names
+            prefix = f'{{"{section_name.replace(".md", "")}", "{curr_subcat}", "{curr_subsubcat}"}}'
-                        currMdSubheading = "#" + line.replace("## ▷", "").strip().replace(" / ", "-").replace(" ", "-").lower()
+            content = line[2:] if line.startswith("* ") else line
-                    currSubSubCat = line.replace("## ▷", "").strip()
+            modified_lines.append(prefix + content)
            elif subURL=="storage":
                if line.startswith("## "):
                    currMdSubheading = "#" + line.replace("## ", "").strip().replace(" / ", "-").replace(" ", "-").lower()
                    currSubCat = line.replace("## ", "").strip()
                    currSubSubCat = "/"
                elif line.startswith("### "):
                    currMdSubheading = "#" + line.replace("### ", "").strip().replace(" / ", "-").replace(" ", "-").lower()
                    currSubSubCat = line.replace("### ", "").strip()
            # Remove links from subcategory titles (because the screw the format)
            if 'http' in currSubCat: currSubCat = ''
            if 'http' in currSubSubCat: currSubSubCat = ''
        elif any(char.isalpha() for char in line): #If line has content
            preText = f"{{\"{sectionName.replace(".md", "")}\", \"{currSubCat}\", \"{currSubSubCat}\"}}"
            if line.startswith("* "): line = line[2:]
            modified_lines.append(preText + line)
    return modified_lines
-#----------------base64 page processing------------
+# Base64 processing functions
-import base64
+def fix_base64_padding(encoded_string: str) -> str:
-import re
+    """Fix base64 padding."""
 doBase64Decoding = True
 def fix_base64_string(encoded_string):
    missing_padding = len(encoded_string) % 4
-    if missing_padding != 0:
+    if missing_padding:
-        encoded_string += '=' * (4 - missing_padding)
+        encoded_string += "=" * (4 - missing_padding)
    return encoded_string
-def decode_base64_in_backticks(input_string):
+
 def decode_base64_content(input_string: str) -> str:
    """Decode base64 content within backticks."""
    if not CONFIG.decode_base64:
        return input_string
    def base64_decode(match):
-        encoded_data = match.group(0)[1:-1]  # Extract content within backticks
+        encoded_data = match.group(0)[1:-1]  # Remove backticks
-        decoded_bytes = base64.b64decode( fix_base64_string(encoded_data) )
+        decoded_bytes = base64.b64decode(fix_base64_padding(encoded_data))
        return decoded_bytes.decode()
-    pattern = r"`[^`]+`"  # Regex pattern to find substrings within backticks
+    pattern = r"`[^`]+`"
-    decoded_string = re.sub(pattern, base64_decode, input_string)
+    return re.sub(pattern, base64_decode, input_string)
    return decoded_string
 def remove_empty_lines(text):
    """Return *text* without its empty or whitespace-only lines."""
    return "\n".join(row for row in text.split("\n") if row.strip())
-def extract_base64_sections(base64_page):
+def process_base64_sections(base64_page: str) -> List[str]:
-    sections = base64_page.split("***")  # Split the input string by "***" to get sections
+    """Process base64 page sections."""
    sections = base64_page.split("***")
    formatted_sections = []
    for section in sections:
-        formatted_section = remove_empty_lines( section.strip().replace("#### ", "").replace("\n\n", " - ").replace("\n", ", ") )
+        # Clean up section formatting
-        if doBase64Decoding: formatted_section = decode_base64_in_backticks(formatted_section)
+        clean_section = (
-        formatted_section = '[🔑Base64](https://rentry.co/FMHYBase64) ► ' + formatted_section
+            section.strip()
            .replace("#### ", "")
            .replace("\n\n", " - ")
            .replace("\n", ", ")
        )
        # Remove empty lines
        lines = [line for line in clean_section.split("\n") if line.strip()]
        clean_section = "\n".join(lines)
        # Decode base64 if enabled
        clean_section = decode_base64_content(clean_section)
        # Add base64 prefix
        formatted_section = (
            "[🔑Base64](https://rentry.co/FMHYBase64) ► " + clean_section
        )
        formatted_sections.append(formatted_section)
-    lines = formatted_sections
+
-    return lines
+    return formatted_sections
 #----------------</end>base64 page processing------------
-
+async def download_wiki_content_async(
-def dlWikiChunk(fileName, icon, redditSubURL):
+    session: aiohttp.ClientSession, filename: str
-
+) -> Tuple[str, List[str]]:
-    #first, try to get the chunk locally
+    """Download and process wiki content asynchronously."""
    # First try to load locally
    try:
-        #First, try to get it from the local file
+        with open(filename, "r", encoding="utf-8") as f:
-        print("Loading " + fileName + " from local file...")
+            content = f.read()
-        with open(fileName.lower(), 'r') as f:
+        logger.info("Loaded %s locally", filename)
            page = f.read()
        print("Loaded.\n")
    #if not available locally, download the chunk
    except:
        if not fileName=='base64.md':
            print("Local file not found. Downloading " + fileName + " from Github...")
            page = requests.get("https://raw.githubusercontent.com/fmhy/FMHYedit/main/docs/" + fileName.lower()).text
        elif fileName=='base64.md':
            print("Local file not found. Downloading rentry.co/FMHYBase64...")
            page = requests.get("https://rentry.co/FMHYBase64/raw").text.replace("\r", "")
        print("Downloaded")
-    #add a pretext
+        if filename != "base64.md":
-    redditBaseURL = "https://www.reddit.com/r/FREEMEDIAHECKYEAH/wiki/"
+            sub_url = filename.replace(".md", "").lower()
-    siteBaseURL = "https://fmhy.net/"
+            return filename, add_hierarchy_prefix(
-    if not fileName=='base64.md':
+                content.split("\n"), filename, sub_url
-        pagesDevSiteSubURL = fileName.replace(".md", "").lower()
+            )
-        subURL = pagesDevSiteSubURL
+        else:
-        lines = page.split('\n')
+            return filename, process_base64_sections(content)
-        lines = addPretext(lines, fileName, siteBaseURL, subURL)
+    except FileNotFoundError:
-    elif fileName=='base64.md':
+        pass
        lines = extract_base64_sections(page)
-    return lines
+    # Download remotely if not found locally
    try:
        if filename != "base64.md":
            url = CONFIG.github_raw_base + filename
        else:
            url = CONFIG.base64_rentry_url
-def cleanLineForSearchMatchChecks(line):
+        async with session.get(url, timeout=30) as resp:
-    siteBaseURL = "https://fmhy.net/"
+            resp.raise_for_status()
-    redditBaseURL = "https://www.reddit.com/r/FREEMEDIAHECKYEAH/wiki/"
+            content = await resp.text()
    return line.replace(redditBaseURL, '/').replace(siteBaseURL, '/')
-def alternativeWikiIndexing():
+            if filename == "base64.md":
-    wikiChunks = [
+                content = content.replace("\r", "")
-        dlWikiChunk("Video.md", "📺", "video"),
+                logger.info("Downloaded base64 page")
-        dlWikiChunk("AI.md", "🤖", "ai"),
+                return filename, process_base64_sections(content)
-        dlWikiChunk("Mobile.md", "📱", "mobile"),
+            else:
-        dlWikiChunk("Audio.md", "🎵", "audio"),
+                logger.info("Downloaded %s", filename)
-        dlWikiChunk("Downloading.md", "💾", "download"),
+                sub_url = filename.replace(".md", "").lower()
-        dlWikiChunk("Educational.md", "🧠", "educational"),
+                return filename, add_hierarchy_prefix(
-        dlWikiChunk("Gaming.md", "🎮", "gaming"),
+                    content.split("\n"), filename, sub_url
-        dlWikiChunk("privacy.md", "📛", "adblock-vpn-privacy"),
+                )
-        dlWikiChunk("System-Tools.md", "💻", "system-tools"),
+
-        dlWikiChunk("File-Tools.md", "🗃️", "file-tools"),
+    except Exception as e:
-        dlWikiChunk("Internet-Tools.md", "🔗", "internet-tools"),
+        logger.error("Failed to fetch %s (%s). Skipping.", filename, e)
-        dlWikiChunk("Social-Media-Tools.md", "💬", "social-media"),
+        return filename, []
        dlWikiChunk("Text-Tools.md", "📝", "text-tools"),
        dlWikiChunk("Video-Tools.md", "📼", "video-tools"),
        dlWikiChunk("MISC.md", "📂", "misc"),
        dlWikiChunk("Reading.md", "📗", "reading"),
        dlWikiChunk("Torrenting.md", "🌀", "torrent"),
        dlWikiChunk("image-tools.md", "📷", "img-tools"),
        dlWikiChunk("gaming-tools.md", "👾", "gaming-tools"),
        dlWikiChunk("linux-macos.md", "🐧🍏", "linux"),
        dlWikiChunk("developer-tools.md", "🖥️", "dev-tools"),
        dlWikiChunk("Non-English.md", "🌏", "non-eng"),
        dlWikiChunk("STORAGE.md", "🗄️", "storage"),
        #dlWikiChunk("base64.md", "🔑", "base64"),
        dlWikiChunk("NSFWPiracy.md", "🌶", "https://saidit.net/s/freemediafuckyeah/wiki/index")
    ]
    return [item for sublist in wikiChunks for item in sublist] #Flatten a <list of lists of strings> into a <list of strings>
 #--------------------------------
-# Save the result of alternativeWikiIndexing to a .md file
+async def collect_all_wiki_content_async() -> List[str]:
-# with open('wiki_adapted.md', 'w') as f:
+    """Collect and process all wiki sections concurrently."""
-#     for line in alternativeWikiIndexing():
+    async with aiohttp.ClientSession() as session:
-#         f.write(line + '\n')
+        tasks = []
        for section in WIKI_SECTIONS:
            task = download_wiki_content_async(session, section.filename)
            tasks.append(task)
-# Instead of saving it to a file, save it into a string variable
+        logger.info("Starting concurrent fetching of %d sections...", len(tasks))
-wiki_adapted_md = '\n'.join(alternativeWikiIndexing())
+        results = await asyncio.gather(*tasks, return_exceptions=True)
-# Remove from the lines in wiki_adapted_md any line that doesnt contain the character `⭐` or '🌟'
+        all_lines = []
-wiki_adapted_starred_only_md = '\n'.join([line for line in wiki_adapted_md.split('\n') if '⭐' in line or '🌟' in line])
+        for result in results:
            if isinstance(result, Exception):
                logger.error("Download task failed: %s", result)
                continue
            filename, lines = result
            all_lines.extend(lines)
        return all_lines
 # Wiki sections to process, written as WikiSection(filename, icon, url_key).
 # collect_all_wiki_content_async creates one download task per entry and
 # concatenates the results in this order.  "base64.md" is special-cased by
 # the downloader: it comes from the rentry page, not the GitHub docs tree.
 WIKI_SECTIONS = [
    WikiSection("video.md", "📺", "video"),
    WikiSection("ai.md", "🤖", "ai"),
    WikiSection("mobile.md", "📱", "mobile"),
    WikiSection("audio.md", "🎵", "audio"),
    WikiSection("downloading.md", "💾", "downloading"),
    WikiSection("educational.md", "🧠", "educational"),
    WikiSection("gaming.md", "🎮", "gaming"),
    WikiSection("privacy.md", "📛", "privacy"),
    WikiSection("system-tools.md", "💻", "system-tools"),
    WikiSection("file-tools.md", "🗃️", "file-tools"),
    WikiSection("internet-tools.md", "🔗", "internet-tools"),
    WikiSection("social-media-tools.md", "💬", "social-media-tools"),
    WikiSection("text-tools.md", "📝", "text-tools"),
    WikiSection("video-tools.md", "📼", "video-tools"),
    WikiSection("misc.md", "📂", "misc"),
    WikiSection("reading.md", "📗", "reading"),
    WikiSection("torrenting.md", "🌀", "torrenting"),
    WikiSection("image-tools.md", "📷", "image-tools"),
    WikiSection("gaming-tools.md", "👾", "gaming-tools"),
    WikiSection("linux-macos.md", "🐧🍏", "linux-macos"),
    WikiSection("developer-tools.md", "🖥️", "developer-tools"),
    WikiSection("non-english.md", "🌏", "non-english"),
    WikiSection("storage.md", "🗄️", "storage"),
    WikiSection("base64.md", "🔑", "base64"),
    WikiSection("unsafe.md", "🌶", "unsafe"),
 ]
 import re
-def markdown_to_html_bookmarks(input_md_text, output_file):
+async def main_async() -> None:
-    # Predefined folder name
+    """Main execution function (async version)."""
-    folder_name = "FMHY"
+    logger.info("Collecting wiki content...")
    all_content = await collect_all_wiki_content_async()
    full_content = "\n".join(all_content)
-    # Read the input markdown file
+    # Generate both bookmark files
-    #with open(input_file, 'r', encoding='utf-8') as f:
+    create_html_bookmarks(full_content, "fmhy_in_bookmarks.html")
-    #    markdown_content = f.read()
+    create_html_bookmarks(
        full_content, "fmhy_in_bookmarks_starred_only.html", starred_only=True
    )
-    # Instead of reading from a file, read from a string variable
+    logger.info("Bookmark generation complete!")
    markdown_content = input_md_text
-    # Regex pattern to extract URLs and titles from markdown
+
-    url_pattern = re.compile(r'\[([^\]]+)\]\((https?://[^\)]+)\)')
+def parse_bookmark_line(line: str) -> Tuple[str, str, str, BookmarkLine | None]:
-    # Regex pattern to extract hierarchy levels
+    """Parse a line to extract hierarchy and bookmark data."""
    url_pattern = re.compile(r"\[([^\]]+)\]\((https?://[^\)]+)\)")
    hierarchy_pattern = re.compile(r'^\{"([^"]+)", "([^"]+)", "([^"]+)"\}')
-    # Dictionary to hold bookmarks by hierarchy
+    hierarchy_match = hierarchy_pattern.match(line)
-    bookmarks = {}
+    if not hierarchy_match:
        return "", "", "", None
-    # Split the content by lines
+    level1, level2, level3 = hierarchy_match.groups()
-    lines = markdown_content.split('\n')
+    matches = url_pattern.findall(line)
-    # Parse each line
+    # Check if line contains starred content
-    for line in lines:
+    is_starred = "⭐" in line or "🌟" in line
-        # Find hierarchy levels
+
-        hierarchy_match = hierarchy_pattern.match(line)
+    # Extract raw description (text after last URL)
-        if not hierarchy_match:
+    last_paren = line.rfind(")")
    description_raw = (
        line[last_paren + 1 :].replace("**", "").strip() if last_paren != -1 else ""
    )
    bookmark_line = BookmarkLine(
        is_starred=is_starred, description_raw=description_raw, links=matches
    )
    return level1, level2, level3, bookmark_line
 def generate_bookmark_html(
    bookmarks_dict: Dict[str, Dict[str, Dict[str, List[BookmarkLine]]]],
    indent: int = 1,
    starred_only: bool = False,
    path: Tuple[str, ...] = (),
 ) -> str:
    """Render a nested bookmark dictionary as Netscape bookmark-file markup.

    Every key becomes a folder (``<DT><H3>``).  Dict values are rendered
    recursively; any other value is treated as a list of BookmarkLine-like
    objects and rendered as ``<DT><A>`` entries.

    Args:
        bookmarks_dict: Mapping of folder name to sub-mapping or leaf list.
        indent: Nesting depth of the emitted folders, four spaces per level.
        starred_only: When True, skip unstarred lines and emit only the
            first link of each starred line.
        path: Folder names of the ancestors of ``bookmarks_dict``.

    Returns:
        The HTML fragment for ``bookmarks_dict`` ("" for an empty mapping).
    """
    # Imported locally because the module has no file-level ``html`` import.
    from html import escape

    pad = "    " * indent
    item_pad = "    " * (indent + 1)
    parts: List[str] = []  # joined once at the end instead of repeated +=

    for key, value in bookmarks_dict.items():
        # Folder names come from downloaded markdown, so escape them.
        parts.append(f"{pad}<DT><H3>{escape(str(key), quote=False)}</H3>\n")
        parts.append(f"{pad}<DL><p>\n")
        current_path = path + (key,)

        if isinstance(value, dict):
            parts.append(
                generate_bookmark_html(
                    value, indent + 1, starred_only, current_path
                )
            )
        else:
            # The innermost three names are (level1, level2, level3).  Slicing
            # keeps this valid when a caller supplies a non-empty ``path``,
            # which previously made the three-way unpacking raise ValueError.
            if len(current_path) >= 3:
                level1, level2, level3 = current_path[-3:]
            else:
                level1 = level2 = level3 = ""

            for bookmark_line in value:
                if starred_only and not bookmark_line.is_starred:
                    continue

                if bookmark_line.description_raw:
                    effective_description = bookmark_line.description_raw
                else:
                    # Fall back to the deepest meaningful folder name; "/"
                    # is the placeholder for "no sub-subcategory".
                    effective_description = "- " + (
                        level3 if level3 != "/" else level2 if level2 else level1
                    )

                # Starred-only output keeps just the first link of a line.
                links_to_render = bookmark_line.links
                if starred_only:
                    links_to_render = links_to_render[:1]

                for title, url in links_to_render:
                    anchor_text = f"{title} {effective_description}".strip()
                    # Escape both the attribute value and the link text so
                    # "&", "<", ">" or quotes cannot corrupt the markup.
                    parts.append(
                        f'{item_pad}<DT><A HREF="{escape(url, quote=True)}" '
                        f'ADD_DATE="0">{escape(anchor_text, quote=False)}</A>\n'
                    )

        parts.append(f"{pad}</DL><p>\n")

    return "".join(parts)
 def create_html_bookmarks(
    content: str, output_file: str, starred_only: bool = False
 ) -> None:
    """Create HTML bookmark file from processed content."""
    bookmarks: Dict[str, Dict[str, Dict[str, List[BookmarkLine]]]] = {}
    for line in content.split("\n"):
        level1, level2, level3, bookmark_line = parse_bookmark_line(line)
        if (
            not level1 or bookmark_line is None
        ):  # Skip lines that don't match hierarchy pattern
            continue
-        level1, level2, level3 = hierarchy_match.groups()
+        # Initialize nested structure
        bookmarks.setdefault(level1, {}).setdefault(level2, {}).setdefault(level3, [])
        bookmarks[level1][level2][level3].append(bookmark_line)
-        # Initialize nested dictionaries for hierarchy levels
+    # Generate HTML
-        if level1 not in bookmarks:
+    html_content = (
-            bookmarks[level1] = {}
+        "<!DOCTYPE NETSCAPE-Bookmark-file-1>\n"
-        if level2 not in bookmarks[level1]:
+        '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">\n'
-            bookmarks[level1][level2] = {}
+        "<TITLE>Bookmarks</TITLE>\n"
-        if level3 not in bookmarks[level1][level2]:
+        "<H1>Bookmarks</H1>\n"
-            bookmarks[level1][level2][level3] = []
+        "<DL><p>\n"
        f"    <DT><H3>{CONFIG.folder_name}</H3>\n"
        "    <DL><p>\n"
        + generate_bookmark_html(bookmarks, indent=2, starred_only=starred_only)
        + "    </DL><p>\n"
        "</DL><p>\n"
    )
-        # Find all matches in the line for URLs
+    with open(output_file, "w", encoding="utf-8") as f:
        matches = url_pattern.findall(line)
        # If the input_md_text is wiki_adapted_starred_only_md, only add the first match of url_pattern in each line
        if input_md_text == wiki_adapted_starred_only_md:
            matches = matches[:1]
        # Extract the description (text after the last match)
        last_match_end = line.rfind(')')
        description = line[last_match_end+1:].replace('**', '').strip() if last_match_end != -1 else ''
        # When the description is empty, use as description the lowest hierachy level that is not empty
        if not description:
            description = '- ' + (level3 if level3 != '/' else level2 if level2 else level1)
        # Add matches to the appropriate hierarchy
        for title, url in matches:
            full_title = f"{title} {description}" if description else title
            bookmarks[level1][level2][level3].append((full_title, url))
    # Function to generate HTML from nested dictionary
    def generate_html(bookmarks_dict, indent=1):
        html = ''
        for key, value in bookmarks_dict.items():
            html += '    ' * indent + f'<DT><H3>{key}</H3>\n'
            html += '    ' * indent + '<DL><p>\n'
            if isinstance(value, dict):
                html += generate_html(value, indent + 1)
            else:
                for full_title, url in value:
                    html += '    ' * (indent + 1) + f'<DT><A HREF="{url}" ADD_DATE="0">{full_title}</A>\n'
            html += '    ' * indent + '</DL><p>\n'
        return html
    # HTML structure
    html_content = '''<!DOCTYPE NETSCAPE-Bookmark-file-1>
 <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
 <TITLE>Bookmarks</TITLE>
 <H1>Bookmarks</H1>
 <DL><p>
 '''
    # Add the main folder
    html_content += f'    <DT><H3>{folder_name}</H3>\n'
    html_content += '    <DL><p>\n'
    # Add bookmarks to HTML content
    html_content += generate_html(bookmarks)
    html_content += '    </DL><p>\n'
    html_content += '</DL><p>\n'
    # Write the HTML content to the output file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html_content)
-    # Print success message
+    logger.info("Created bookmark file: %s", output_file)
    #print(f'Successfully created bookmarks in {output_file}')
-# Example usage:
+
-markdown_to_html_bookmarks(wiki_adapted_md, 'fmhy_in_bookmarks.html')
+def main() -> None:
-markdown_to_html_bookmarks(wiki_adapted_starred_only_md, 'fmhy_in_bookmarks_starred_only.html')
+    """Main execution function."""
    asyncio.run(main_async())
 if __name__ == "__main__":
    main()
--- a/requirements.txt
+++ b/requirements.txt
@ -1 +1 @@
-requests
+aiohttp