mirror of
https://github.com/slavfox/Cozette.git
synced 2026-03-11 08:54:33 +00:00
62 lines
2 KiB
Python
62 lines
2 KiB
Python
import re
|
|
from pathlib import Path
|
|
from typing import Dict, List, Set
|
|
from unicodedata import name as uniname, east_asian_width as eaw
|
|
|
|
UESCAPE = re.compile(r"\\[uU]([0-9A-Fa-f]{4,5})")
|
|
|
|
|
|
def scan_file_for_nonascii(path: Path) -> Set[int]:
|
|
with path.open() as f:
|
|
try:
|
|
src = f.read()
|
|
except UnicodeDecodeError:
|
|
return set()
|
|
nonascii = {codepoint for c in set(src) if (codepoint := ord(c)) > 127}
|
|
escapes = UESCAPE.finditer(src)
|
|
nonascii |= {int(esc.group(1), 16) for esc in escapes}
|
|
return nonascii
|
|
|
|
|
|
# noinspection PyShadowingBuiltins
|
|
def scan_for_codepoints(dir: Path) -> Dict[int, List[Path]]:
|
|
if dir.is_file():
|
|
return {cp: [dir] for cp in scan_file_for_nonascii(dir)}
|
|
non_ascii_codepoints: Dict[int, List[Path]] = {}
|
|
for path in dir.glob("**/*"):
|
|
if path.is_file():
|
|
for cp in scan_file_for_nonascii(path):
|
|
non_ascii_codepoints.setdefault(cp, []).append(path)
|
|
return non_ascii_codepoints
|
|
|
|
|
|
def print_codepoints_for_changelog(
|
|
codepoints: Dict[int, List[Path]], print_source_files=False, reverse=False
|
|
) -> None:
|
|
for cp in sorted(codepoints, reverse=reverse):
|
|
ch = chr(cp)
|
|
try:
|
|
name = uniname(ch).strip()
|
|
except ValueError:
|
|
name = ""
|
|
if name:
|
|
if "VARIATION SELECTOR" in name:
|
|
continue
|
|
name = " " + name
|
|
print(f"**{ch} U+{cp:04X}{name}** {eaw(ch)}", end="")
|
|
if print_source_files:
|
|
print(f": {' '.join(str(p) for p in codepoints[cp])}")
|
|
else:
|
|
print("\n", end="")
|
|
|
|
|
|
def find_missing_codepoints(
|
|
sfdpath: Path, codepoints: Dict[int, List[Path]]
|
|
) -> Dict[int, List[Path]]:
|
|
with sfdpath.open() as sfd:
|
|
defined = set(
|
|
int(line.split()[2])
|
|
for line in sfd.readlines()
|
|
if line.startswith("BDFChar: ")
|
|
)
|
|
return {cp: paths for cp, paths in codepoints.items() if cp not in defined}
|