mirror of
https://github.com/fmhy/bookmarks.git
synced 2026-03-11 08:55:39 +00:00
Refactor bookmark generator and use asynchronous events (#11)
This commit is contained in:
parent
78e65e8a01
commit
5defd48c97
2 changed files with 346 additions and 222 deletions
|
|
@ -1,269 +1,393 @@
|
||||||
import requests
|
"""Generate FMHY bookmark HTML files from FMHY markdown sections."""
|
||||||
|
|
||||||
def addPretext(lines, sectionName, baseURL, subURL):
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import base64
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
|
import aiohttp
|
||||||
|
|
||||||
|
# Configure logging
|
||||||
|
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class Config:
    """Immutable settings shared by the FMHY bookmark generator.

    The dataclass is frozen, so fields cannot be reassigned once an
    instance has been created.
    """

    # Base URL of the FMHY website.
    site_base_url: str = "https://fmhy.net/"
    # Base URL of the FMHY wiki on Reddit.
    reddit_base_url: str = "https://www.reddit.com/r/FREEMEDIAHECKYEAH/wiki/"
    # Raw-text endpoint of the base64 rentry page.
    base64_rentry_url: str = "https://rentry.co/FMHYBase64/raw"
    # Prefix prepended to a section filename to fetch it from GitHub.
    github_raw_base: str = "https://raw.githubusercontent.com/fmhy/edit/refs/heads/main/docs/"
    # Name of the top-level folder in the generated bookmark files.
    folder_name: str = "FMHY"
    # When False, base64 snippets inside backticks are left encoded.
    decode_base64: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class BookmarkLine:
    """Parsed data for one content line stored at a leaf of the hierarchy."""

    # True when the source line contains ⭐ or 🌟.
    is_starred: bool
    # Text following the last ")" on the line; empty when there is none.
    description_raw: str
    # (title, url) pairs in the order the link pattern matched them.
    links: List[Tuple[str, str]]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class WikiSection:
    """One wiki section that the generator fetches and processes."""

    # Markdown file name of the section (also used to build the download URL).
    filename: str
    # Emoji associated with the section.
    icon: str
    # Short key for the section; NOTE(review): not read by the code visible
    # in this file, presumably a URL slug -- confirm before relying on it.
    url_key: str
|
||||||
|
|
||||||
|
|
||||||
|
# Module-wide configuration instance built from the Config defaults.
CONFIG = Config()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_heading(line: str, sub_url: str) -> Tuple[str, str]:
    """Classify a markdown heading line and extract its title.

    Most sections mark subcategories with ``# ►`` and sub-subcategories
    with ``## ▷``; the ``storage`` section uses plain ``## `` and ``### ``
    headings instead.

    Args:
        line: The heading line to inspect.
        sub_url: Section key; ``"storage"`` selects the alternative markers.

    Returns:
        ``(title, "/")`` for a subcategory heading, ``("", title)`` for a
        sub-subcategory heading, and ``("", "")`` for any other line.
    """
    if sub_url != "storage":
        subcat_marker, subsubcat_marker = "# ►", "## ▷"
    else:  # storage section uses different heading levels
        subcat_marker, subsubcat_marker = "## ", "### "

    # Strip only the leading marker: str.replace() would also delete any
    # later occurrence of the marker inside the title itself.
    if line.startswith(subcat_marker):
        return line[len(subcat_marker):].strip(), "/"
    if line.startswith(subsubcat_marker):
        return "", line[len(subsubcat_marker):].strip()
    return "", ""
|
||||||
|
|
||||||
|
|
||||||
|
def clean_category_name(category: str) -> str:
    """Blank out category names that embed a link.

    Returns an empty string when ``category`` contains the substring
    ``"http"``; otherwise the name is returned unchanged.
    """
    if "http" in category:
        return ""
    return category
|
||||||
|
|
||||||
|
|
||||||
|
def add_hierarchy_prefix(
    lines: List[str], section_name: str, sub_url: str
) -> List[str]:
    """Tag every content line with its position in the section hierarchy.

    Heading lines update the current subcategory / sub-subcategory and are
    not emitted. Every other line containing at least one alphabetic
    character is emitted as ``{"section", "subcat", "subsubcat"}`` followed
    by the line text, with a leading ``"* "`` bullet removed.

    Args:
        lines: Raw markdown lines of one section.
        section_name: Section file name; ``".md"`` is removed for the prefix.
        sub_url: Section key forwarded to ``parse_heading``.

    Returns:
        The prefixed content lines in their original order.
    """
    section_label = section_name.replace(".md", "")
    subcat_name = ""
    subsubcat_name = ""
    prefixed: List[str] = []

    for raw in lines:
        if raw.startswith("#"):  # Heading line: only updates the state
            heading_subcat, heading_subsubcat = parse_heading(raw, sub_url)
            if heading_subcat:
                subcat_name = clean_category_name(heading_subcat)
            if heading_subsubcat:
                subsubcat_name = clean_category_name(heading_subsubcat)
            continue

        if not any(ch.isalpha() for ch in raw):
            continue  # blank or decoration-only line

        body = raw[2:] if raw.startswith("* ") else raw
        prefixed.append(
            '{"' + section_label + '", "' + subcat_name + '", "' + subsubcat_name + '"}' + body
        )

    return prefixed
|
||||||
|
|
||||||
|
|
||||||
#----------------base64 page processing------------
|
# Base64 processing functions
|
||||||
import base64
|
def fix_base64_padding(encoded_string: str) -> str:
    """Pad ``encoded_string`` with ``=`` up to a multiple of four characters."""
    remainder = len(encoded_string) % 4
    if remainder == 0:
        return encoded_string
    return encoded_string + "=" * (4 - remainder)
|
||||||
|
|
||||||
def decode_base64_in_backticks(input_string):
|
|
||||||
|
def decode_base64_content(input_string: str) -> str:
    """Decode base64 snippets enclosed in backticks.

    Each `` `...` `` span is replaced by its decoded text (backticks
    removed). A span that is not valid base64, or whose bytes are not valid
    UTF-8, is left exactly as it was so that one stray inline-code snippet
    cannot abort processing of the whole page.

    Args:
        input_string: Text possibly containing backtick-delimited base64.

    Returns:
        The text with decodable spans replaced; returned unchanged when
        ``CONFIG.decode_base64`` is false.
    """
    if not CONFIG.decode_base64:
        return input_string

    def base64_decode(match):
        encoded_data = match.group(0)[1:-1]  # Remove backticks
        try:
            return base64.b64decode(fix_base64_padding(encoded_data)).decode()
        except ValueError:
            # binascii.Error (bad base64) and UnicodeDecodeError (non-UTF-8
            # payload) are both ValueError subclasses: keep the original span.
            return match.group(0)

    return re.sub(r"`[^`]+`", base64_decode, input_string)
|
||||||
return decoded_string
|
|
||||||
|
|
||||||
def remove_empty_lines(text):
|
|
||||||
lines = text.split('\n') # Split the text into lines
|
|
||||||
non_empty_lines = [line for line in lines if line.strip()] # Filter out empty lines
|
|
||||||
return '\n'.join(non_empty_lines) # Join non-empty lines back together
|
|
||||||
|
|
||||||
def extract_base64_sections(base64_page):
|
def process_base64_sections(base64_page: str) -> List[str]:
    """Turn the base64 page into one formatted line per section.

    The page is split on ``***``. Within each section the ``#### `` heading
    marker is removed, blank-line breaks become ``" - "`` and remaining
    newlines become ``", "``, so every section collapses to a single line.
    Base64 snippets are decoded via ``decode_base64_content`` and the result
    is prefixed with a link to the base64 page.

    Args:
        base64_page: Raw text of the base64 page.

    Returns:
        One formatted string per non-empty section, in page order.
    """
    formatted_sections: List[str] = []

    for section in base64_page.split("***"):
        clean_section = (
            section.strip()
            .replace("#### ", "")
            .replace("\n\n", " - ")
            .replace("\n", ", ")
        )
        # No newline survives the replacements above, so there are no empty
        # lines left to filter; only a wholly empty section needs skipping
        # (otherwise it would yield an entry consisting of the prefix alone).
        if not clean_section:
            continue

        formatted_sections.append(
            "[🔑Base64](https://rentry.co/FMHYBase64) ► "
            + decode_base64_content(clean_section)
        )

    return formatted_sections
|
||||||
#----------------</end>base64 page processing------------
|
|
||||||
|
|
||||||
|
|
||||||
|
def _process_wiki_content(filename: str, content: str) -> Tuple[str, List[str]]:
    """Convert raw section text into processed lines for ``filename``."""
    if filename == "base64.md":
        return filename, process_base64_sections(content)
    sub_url = filename.replace(".md", "").lower()
    return filename, add_hierarchy_prefix(content.split("\n"), filename, sub_url)


async def download_wiki_content_async(
    session: aiohttp.ClientSession, filename: str
) -> Tuple[str, List[str]]:
    """Load one wiki section and return its processed lines.

    A file named ``filename`` in the working directory takes precedence.
    When it does not exist, the section is downloaded: ``base64.md`` from
    ``CONFIG.base64_rentry_url``, everything else from
    ``CONFIG.github_raw_base + filename``.

    Args:
        session: Open aiohttp session used for the download.
        filename: Section file name, e.g. ``"video.md"``.

    Returns:
        ``(filename, lines)``. ``lines`` is empty when the download or the
        processing of downloaded content fails; the failure is logged.
    """
    # First try to load locally
    try:
        with open(filename, "r", encoding="utf-8") as local_file:
            content = local_file.read()
    except FileNotFoundError:
        pass
    else:
        logger.info("Loaded %s locally", filename)
        return _process_wiki_content(filename, content)

    # Download remotely if not found locally
    is_base64 = filename == "base64.md"
    url = CONFIG.base64_rentry_url if is_base64 else CONFIG.github_raw_base + filename

    try:
        # aiohttp expects a ClientTimeout object; a bare number is the
        # legacy form of this argument.
        request_timeout = aiohttp.ClientTimeout(total=30)
        async with session.get(url, timeout=request_timeout) as resp:
            resp.raise_for_status()
            content = await resp.text()

        if is_base64:
            content = content.replace("\r", "")
            logger.info("Downloaded base64 page")
        else:
            logger.info("Downloaded %s", filename)
        return _process_wiki_content(filename, content)

    except Exception as e:
        # Best-effort boundary: one failing section must not stop the rest.
        logger.error("Failed to fetch %s (%s). Skipping.", filename, e)
        return filename, []
|
||||||
dlWikiChunk("Text-Tools.md", "📝", "text-tools"),
|
|
||||||
dlWikiChunk("Video-Tools.md", "📼", "video-tools"),
|
|
||||||
dlWikiChunk("MISC.md", "📂", "misc"),
|
|
||||||
dlWikiChunk("Reading.md", "📗", "reading"),
|
|
||||||
dlWikiChunk("Torrenting.md", "🌀", "torrent"),
|
|
||||||
dlWikiChunk("image-tools.md", "📷", "img-tools"),
|
|
||||||
dlWikiChunk("gaming-tools.md", "👾", "gaming-tools"),
|
|
||||||
dlWikiChunk("linux-macos.md", "🐧🍏", "linux"),
|
|
||||||
dlWikiChunk("developer-tools.md", "🖥️", "dev-tools"),
|
|
||||||
dlWikiChunk("Non-English.md", "🌏", "non-eng"),
|
|
||||||
dlWikiChunk("STORAGE.md", "🗄️", "storage"),
|
|
||||||
#dlWikiChunk("base64.md", "🔑", "base64"),
|
|
||||||
dlWikiChunk("NSFWPiracy.md", "🌶", "https://saidit.net/s/freemediafuckyeah/wiki/index")
|
|
||||||
]
|
|
||||||
return [item for sublist in wikiChunks for item in sublist] #Flatten a <list of lists of strings> into a <list of strings>
|
|
||||||
#--------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
# Save the result of alternativeWikiIndexing to a .md file
|
async def collect_all_wiki_content_async() -> List[str]:
    """Fetch every section in ``WIKI_SECTIONS`` concurrently.

    Returns:
        The processed lines of all sections, concatenated in
        ``WIKI_SECTIONS`` order. Sections whose task failed are logged and
        contribute no lines.
    """
    async with aiohttp.ClientSession() as session:
        tasks = [
            download_wiki_content_async(session, section.filename)
            for section in WIKI_SECTIONS
        ]
        logger.info("Starting concurrent fetching of %d sections...", len(tasks))
        results = await asyncio.gather(*tasks, return_exceptions=True)

    all_lines: List[str] = []
    for result in results:
        # gather(return_exceptions=True) can also hand back CancelledError,
        # which derives from BaseException rather than Exception.
        if isinstance(result, BaseException):
            logger.error("Download task failed: %s", result)
            continue
        _, lines = result
        all_lines.extend(lines)

    return all_lines
|
||||||
|
|
||||||
|
|
||||||
|
# Wiki sections to process
|
||||||
|
# Sections to fetch, listed as (markdown filename, icon, url key) triples.
WIKI_SECTIONS = [
    WikiSection(filename, icon, url_key)
    for filename, icon, url_key in (
        ("video.md", "📺", "video"),
        ("ai.md", "🤖", "ai"),
        ("mobile.md", "📱", "mobile"),
        ("audio.md", "🎵", "audio"),
        ("downloading.md", "💾", "downloading"),
        ("educational.md", "🧠", "educational"),
        ("gaming.md", "🎮", "gaming"),
        ("privacy.md", "📛", "privacy"),
        ("system-tools.md", "💻", "system-tools"),
        ("file-tools.md", "🗃️", "file-tools"),
        ("internet-tools.md", "🔗", "internet-tools"),
        ("social-media-tools.md", "💬", "social-media-tools"),
        ("text-tools.md", "📝", "text-tools"),
        ("video-tools.md", "📼", "video-tools"),
        ("misc.md", "📂", "misc"),
        ("reading.md", "📗", "reading"),
        ("torrenting.md", "🌀", "torrenting"),
        ("image-tools.md", "📷", "image-tools"),
        ("gaming-tools.md", "👾", "gaming-tools"),
        ("linux-macos.md", "🐧🍏", "linux-macos"),
        ("developer-tools.md", "🖥️", "developer-tools"),
        ("non-english.md", "🌏", "non-english"),
        ("storage.md", "🗄️", "storage"),
        ("base64.md", "🔑", "base64"),
        ("unsafe.md", "🌶", "unsafe"),
    )
]
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
def markdown_to_html_bookmarks(input_md_text, output_file):
|
async def main_async() -> None:
    """Collect all wiki content and write both bookmark files.

    Produces ``fmhy_in_bookmarks.html`` with every entry and
    ``fmhy_in_bookmarks_starred_only.html`` restricted to starred entries.
    """
    logger.info("Collecting wiki content...")
    full_content = "\n".join(await collect_all_wiki_content_async())

    # (output file, starred_only) for each bookmark file to generate
    outputs = (
        ("fmhy_in_bookmarks.html", False),
        ("fmhy_in_bookmarks_starred_only.html", True),
    )
    for output_file, only_starred in outputs:
        create_html_bookmarks(full_content, output_file, starred_only=only_starred)

    logger.info("Bookmark generation complete!")
|
||||||
markdown_content = input_md_text
|
|
||||||
|
|
||||||
# Regex pattern to extract URLs and titles from markdown
|
|
||||||
url_pattern = re.compile(r'\[([^\]]+)\]\((https?://[^\)]+)\)')
|
# Markdown links of the form [title](http://... or https://...).
_URL_PATTERN = re.compile(r"\[([^\]]+)\]\((https?://[^\)]+)\)")
# Leading {"level1", "level2", "level3"} prefix; every field must be non-empty.
_HIERARCHY_PATTERN = re.compile(r'^\{"([^"]+)", "([^"]+)", "([^"]+)"\}')


def parse_bookmark_line(line: str) -> Tuple[str, str, str, BookmarkLine | None]:
    """Split a prefixed content line into its hierarchy and bookmark data.

    Args:
        line: A line expected to start with a
            ``{"level1", "level2", "level3"}`` hierarchy prefix.

    Returns:
        ``(level1, level2, level3, bookmark_line)``. When the line does not
        start with a prefix whose three fields are all non-empty, the result
        is ``("", "", "", None)``.
    """
    hierarchy_match = _HIERARCHY_PATTERN.match(line)
    if hierarchy_match is None:
        return "", "", "", None

    level1, level2, level3 = hierarchy_match.groups()

    # The description is whatever follows the last ")" on the line, with
    # bold markers removed; it is empty when the line has no ")".
    last_paren = line.rfind(")")
    if last_paren == -1:
        description_raw = ""
    else:
        description_raw = line[last_paren + 1:].replace("**", "").strip()

    bookmark_line = BookmarkLine(
        is_starred="⭐" in line or "🌟" in line,
        description_raw=description_raw,
        links=_URL_PATTERN.findall(line),
    )
    return level1, level2, level3, bookmark_line
|
||||||
|
|
||||||
|
|
||||||
|
def _subtree_has_starred(node) -> bool:
    """Return True when any bookmark line at or below ``node`` is starred."""
    if isinstance(node, dict):
        return any(_subtree_has_starred(child) for child in node.values())
    return any(item.is_starred for item in node)


def generate_bookmark_html(
    bookmarks_dict: Dict[str, Dict[str, Dict[str, List[BookmarkLine]]]],
    indent: int = 1,
    starred_only: bool = False,
    path: Tuple[str, ...] = (),
) -> str:
    """Render a nested bookmark dictionary as Netscape-bookmark HTML.

    Dict values become nested folders; list values are leaves whose
    bookmark lines become ``<A>`` entries. Folder names, URLs and link
    texts are HTML-escaped because they come from downloaded markdown.

    Args:
        bookmarks_dict: Mapping of folder name to sub-mapping or leaf list.
        indent: Number of leading spaces for this nesting level.
        starred_only: Emit only starred lines (first link of each) and omit
            folders that contain no starred line at all.
        path: Folder names leading to ``bookmarks_dict``; used to build the
            fallback description of entries without their own.

    Returns:
        The HTML fragment for this level and everything below it.
    """
    # Imported locally (and not as ``html``) so the block does not depend on
    # a module-level import being present.
    from html import escape

    pad = " " * indent
    link_pad = " " * (indent + 1)
    parts: List[str] = []

    for key, value in bookmarks_dict.items():
        # In starred-only mode a folder without any starred line would
        # otherwise be written out as an empty folder.
        if starred_only and not _subtree_has_starred(value):
            continue

        current_path = path + (key,)
        parts.append(f"{pad}<DT><H3>{escape(key)}</H3>\n")
        parts.append(f"{pad}<DL><p>\n")

        if isinstance(value, dict):
            parts.append(
                generate_bookmark_html(value, indent + 1, starred_only, current_path)
            )
        else:
            # At leaf level the innermost three path entries are
            # (level1, level2, level3); shallower paths have no levels.
            if len(current_path) >= 3:
                level1, level2, level3 = current_path[-3:]
            else:
                level1 = level2 = level3 = ""
            # Fallback description: lowest meaningful hierarchy level.
            fallback_description = "- " + (
                level3 if level3 != "/" else level2 if level2 else level1
            )

            for bookmark_line in value:
                if starred_only and not bookmark_line.is_starred:
                    continue

                description = bookmark_line.description_raw or fallback_description
                # Only the first link is kept for starred content.
                links = bookmark_line.links[:1] if starred_only else bookmark_line.links

                for title, url in links:
                    anchor_text = f"{title} {description}".strip()
                    parts.append(
                        f'{link_pad}<DT><A HREF="{escape(url)}" ADD_DATE="0">'
                        f"{escape(anchor_text)}</A>\n"
                    )

        parts.append(f"{pad}</DL><p>\n")

    return "".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def create_html_bookmarks(
    content: str, output_file: str, starred_only: bool = False
) -> None:
    """Build a bookmark tree from prefixed lines and write it as HTML.

    Args:
        content: Newline-separated lines carrying a hierarchy prefix; lines
            that ``parse_bookmark_line`` cannot parse are skipped.
        output_file: Path of the HTML file to write (UTF-8).
        starred_only: Forwarded to ``generate_bookmark_html``.
    """
    tree: Dict[str, Dict[str, Dict[str, List[BookmarkLine]]]] = {}

    for raw_line in content.split("\n"):
        level1, level2, level3, parsed = parse_bookmark_line(raw_line)
        if level1 and parsed is not None:
            # Create the nested folders on demand, then store the line.
            leaf = tree.setdefault(level1, {}).setdefault(level2, {}).setdefault(level3, [])
            leaf.append(parsed)

    header = "".join(
        [
            "<!DOCTYPE NETSCAPE-Bookmark-file-1>\n",
            '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">\n',
            "<TITLE>Bookmarks</TITLE>\n",
            "<H1>Bookmarks</H1>\n",
            "<DL><p>\n",
            f" <DT><H3>{CONFIG.folder_name}</H3>\n",
            " <DL><p>\n",
        ]
    )
    body = generate_bookmark_html(tree, indent=2, starred_only=starred_only)
    footer = " </DL><p>\n" + "</DL><p>\n"
    html_content = header + body + footer

    with open(output_file, "w", encoding="utf-8") as out:
        out.write(html_content)

    logger.info("Created bookmark file: %s", output_file)
|
||||||
#print(f'Successfully created bookmarks in {output_file}')
|
|
||||||
|
|
||||||
# Example usage:
|
|
||||||
markdown_to_html_bookmarks(wiki_adapted_md, 'fmhy_in_bookmarks.html')
|
def main() -> None:
    """Synchronous entry point: run the async pipeline to completion."""
    pipeline = main_async()
    asyncio.run(pipeline)
|
||||||
|
|
||||||
|
|
||||||
|
# Generate the bookmark files only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||||
|
|
|
||||||
|
|
@ -1 +1 @@
|
||||||
requests
|
aiohttp
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue