[mv3] Add support for regex-based extended filters

Related issues:
- https://github.com/uBlockOrigin/uBOL-home/issues/223
- https://github.com/uBlockOrigin/uAssets/issues/31261
This commit is contained in:
Raymond Hill 2025-12-25 12:34:21 -05:00
parent 8ad61208e7
commit 377cf9d86b
No known key found for this signature in database
GPG key ID: F5630CAE62A14316
5 changed files with 98 additions and 26 deletions

View file

@ -131,10 +131,7 @@
return -1;
};
const lookupHostname = (hostname, data) => {
const listref = binarySearch(data.hostnames, hostname);
if ( listref === -1 ) { return; }
const ilist = data.selectorListRefs[listref];
const selectorsFromListIndex = (data, ilist) => {
const list = JSON.parse(`[${data.selectorLists[ilist]}]`);
const { result } = data;
for ( const iselector of list ) {
@ -146,6 +143,21 @@
}
};
/**
 * Look up every selector list which applies to `hostname` and hand each
 * matching list index to `selectorsFromListIndex()`.
 *
 * Two sources are consulted:
 * - `data.hostnames` / `data.selectorListRefs`: exact match, located with
 *   `binarySearch()`.
 * - `data.fromRegexes`: flat array of pairs, `[ pattern, listIndex, ... ]`.
 *   Patterns are stored as strings and are compiled in place to `RegExp`
 *   instances the first time they are needed, so that further lookups against
 *   the same `data` object do not recompile them.
 *
 * @param {string} hostname - The hostname to look up.
 * @param {object} data - Ruleset data; `fromRegexes` entries may be mutated.
 */
const lookupHostname = (hostname, data) => {
    const listref = binarySearch(data.hostnames, hostname);
    if ( listref !== -1 ) {
        selectorsFromListIndex(data, data.selectorListRefs[listref]);
    }
    // Data which was produced without a `fromRegexes` field must not break
    // the plain hostname lookup above.
    const { fromRegexes } = data;
    if ( Array.isArray(fromRegexes) === false ) { return; }
    for ( let i = 0, n = fromRegexes.length; i < n; i += 2 ) {
        let re = fromRegexes[i+0];
        if ( typeof re === 'string' ) {
            // A pattern which can't be compiled is replaced with `null` so
            // that only this entry is disabled, rather than having the
            // exception abort all lookups against this ruleset.
            try {
                re = new RegExp(re);
            } catch {
                re = null;
            }
            fromRegexes[i+0] = re;
        }
        if ( re === null ) { continue; }
        if ( re.test(hostname) === false ) { continue; }
        selectorsFromListIndex(data, fromRegexes[i+1]);
    }
};
const selectorsFromRuleset = async (realm, rulesetId, result) => {
const data = await localRead(`css.${realm}.${rulesetId}`);
if ( typeof data !== 'object' || data === null ) { return; }

View file

@ -111,6 +111,8 @@ const logProgress = text => {
process?.stdout?.write?.(text.length > 120 ? `${text.slice(0, 119)}` : `${text} `);
};
// Whether a filter's hostname entry contains a slash. Such entries are kept
// apart from plain hostnames when collating cosmetic filters.
const isHnRegexOrPath = hn => {
    return hn.indexOf('/') !== -1;
};
/******************************************************************************/
async function fetchText(url, cacheDir) {
@ -828,7 +830,24 @@ async function processCosmeticFilters(assetDetails, realm, mapin) {
// Collate all distinct selectors
const allSelectors = new Map();
const allHostnames = new Map();
const allRegexesOrPaths = new Map();
let hasEntities = false;
// Record that selector index `iSelector` is associated with entry `hn`.
// Entries flagged by isHnRegexOrPath() are collected in `allRegexesOrPaths`,
// all others in `allHostnames`. Only the latter can raise `hasEntities`,
// which happens when the entry ends with `.*`.
const storeHostnameSelectorPair = (hn, iSelector) => {
    const bucket = isHnRegexOrPath(hn) ? allRegexesOrPaths : allHostnames;
    let selectorSet = bucket.get(hn);
    if ( selectorSet === undefined ) {
        selectorSet = new Set();
        bucket.set(hn, selectorSet);
    }
    selectorSet.add(iSelector);
    if ( bucket !== allHostnames ) { return; }
    // Once set, the flag is never cleared
    if ( hasEntities ) { return; }
    hasEntities = hn.endsWith('.*');
};
for ( const [ selector, details ] of mapin ) {
if ( details.rejected ) { continue; }
if ( allSelectors.has(selector) === false ) {
@ -837,30 +856,30 @@ async function processCosmeticFilters(assetDetails, realm, mapin) {
const iSelector = allSelectors.get(selector);
if ( details.matches ) {
for ( const hn of details.matches ) {
if ( allHostnames.has(hn) === false ) {
allHostnames.set(hn, new Set());
}
allHostnames.get(hn).add(iSelector);
hasEntities ||= hn.endsWith('.*');
storeHostnameSelectorPair(hn, iSelector);
}
}
if ( details.excludeMatches ) {
for ( const hn of details.excludeMatches ) {
if ( allHostnames.has(hn) === false ) {
allHostnames.set(hn, new Set());
}
allHostnames.get(hn).add(~iSelector);
hasEntities ||= hn.endsWith('.*');
storeHostnameSelectorPair(hn, ~iSelector);
}
}
}
const allSelectorLists = new Map();
for ( const [ hn, selectorSet ] of allHostnames ) {
const ilistFromSelectorSet = selectorSet => {
const list = JSON.stringify(Array.from(selectorSet).sort()).slice(1, -1);
if ( allSelectorLists.has(list) === false ) {
allSelectorLists.set(list, allSelectorLists.size);
}
allHostnames.set(hn, allSelectorLists.get(list));
return allSelectorLists.get(list);
};
for ( const [ hn, selectorSet ] of allHostnames ) {
allHostnames.set(hn, ilistFromSelectorSet(selectorSet));
}
for ( const [ regexOrPath, selectorSet ] of allRegexesOrPaths ) {
allRegexesOrPaths.set(regexOrPath, ilistFromSelectorSet(selectorSet));
}
const sortedHostnames = Array.from(allHostnames.keys()).toSorted((a, b) => {
@ -875,6 +894,10 @@ async function processCosmeticFilters(assetDetails, realm, mapin) {
selectorListRefs: sortedHostnames.map(a => allHostnames.get(a)),
hostnames: sortedHostnames,
hasEntities,
fromRegexes: Array.from(allRegexesOrPaths)
.filter(a => a[0].startsWith('/') && a[0].endsWith('/'))
.map(a => [ a[0].slice(1, -1), a[1] ])
.flat(),
});
writeFile(`${scriptletDir}/${realm}/${assetDetails.id}.json`, data);
@ -890,7 +913,7 @@ async function processCosmeticFilters(assetDetails, realm, mapin) {
log(`CSS-${realm}: ${allSelectors.size} distinct filters for ${allHostnames.size} distinct hostnames`);
return sortedHostnames.length;
return sortedHostnames.length + allRegexesOrPaths.size;
}
/******************************************************************************/

View file

@ -33,6 +33,7 @@ const worldTemplate = {
args: new Map(),
arglists: new Map(),
hostnames: new Map(),
regexesOrPaths: new Map(),
matches: new Set(),
hasEntities: false,
hasAncestors: false,
@ -126,6 +127,15 @@ export function compile(assetDetails, details) {
const arglistIndex = worldDetails.arglists.get(arglistKey);
if ( details.matches ) {
for ( const hn of details.matches ) {
if ( hn.includes('/') ) {
worldDetails.matches.clear();
worldDetails.matches.add('*');
if ( worldDetails.regexesOrPaths.has(hn) === false ) {
worldDetails.regexesOrPaths.set(hn, new Set());
}
worldDetails.regexesOrPaths.get(hn).add(arglistIndex);
continue;
}
const isEntity = hn.endsWith('.*') || hn.endsWith('.*>>');
worldDetails.hasEntities ||= isEntity;
const isAncestor = hn.endsWith('>>')
@ -147,6 +157,13 @@ export function compile(assetDetails, details) {
}
if ( details.excludeMatches ) {
for ( const hn of details.excludeMatches ) {
if ( hn.includes('/') ) {
if ( worldDetails.regexesOrPaths.has(hn) === false ) {
worldDetails.regexesOrPaths.set(hn, new Set());
}
worldDetails.regexesOrPaths.get(hn).add(~arglistIndex);
continue;
}
if ( worldDetails.hostnames.has(hn) === false ) {
worldDetails.hostnames.set(hn, new Set());
}
@ -172,9 +189,17 @@ export async function commit(rulesetId, path, writeFn) {
if ( d !== 0 ) { return d; }
return a[0] < b[0] ? -1 : 1;
}).map(a => ([ a[0], JSON.stringify(Array.from(a[1]).map(a => JSON.parse(a))).slice(1,-1)]));
let content = safeReplace(scriptletTemplate, /\$rulesetId\$/, rulesetId, 0);
content = safeReplace(content, 'self.$hasEntities$', 'true');
content = safeReplace(content, 'self.$hasAncestors$', 'true');
const scriptletFromRegexes = Array.from(worldDetails.regexesOrPaths)
.filter(a => a[0].startsWith('/') && a[0].endsWith('/'))
.map(a => [ a[0].slice(1, -1), JSON.stringify(Array.from(a[1])).slice(1,-1) ])
.flat();
let content = safeReplace(scriptletTemplate, 'self.$hasEntities$', JSON.stringify(worldDetails.hasEntities));
content = safeReplace(content, 'self.$hasAncestors$', JSON.stringify(worldDetails.hasAncestors));
content = safeReplace(content, 'self.$hasRegexes$', JSON.stringify(scriptletFromRegexes.length !== 0));
content = safeReplace(content,
'self.$scriptletFromRegexes$',
`/* ${worldDetails.regexesOrPaths.size} */ ${JSON.stringify(scriptletFromRegexes)}`
);
content = safeReplace(content,
'self.$scriptletHostnames$',
`/* ${hostnames.length} */ ${JSON.stringify(hostnames.map(a => a[0]))}`
@ -199,6 +224,7 @@ export async function commit(rulesetId, path, writeFn) {
'self.$scriptletCode$',
Array.from(allFunctions.values()).sort().join('\n\n')
);
content = safeReplace(content, /\$rulesetId\$/, rulesetId, 0);
writeFn(`${path}/${world.toLowerCase()}/${rulesetId}.js`, content);
stats[world] = Array.from(worldDetails.matches).sort();
}

View file

@ -46,8 +46,11 @@ const $scriptletArglistRefs$ = self.$scriptletArglistRefs$;
const $scriptletHostnames$ = self.$scriptletHostnames$;
const $scriptletFromRegexes$ = self.$scriptletFromRegexes$;
const $hasEntities$ = self.$hasEntities$;
const $hasAncestors$ = self.$hasAncestors$;
const $hasRegexes$ = self.$hasRegexes$;
/******************************************************************************/
@ -134,11 +137,9 @@ if ( $hasAncestors$ ) {
}
$scriptletHostnames$.length = 0;
if ( todoIndices.size === 0 ) { return; }
// Collect arglist references
const todo = new Set();
{
if ( todoIndices.size !== 0 ) {
const arglistRefs = $scriptletArglistRefs$.split(';');
for ( const i of todoIndices ) {
for ( const ref of JSON.parse(`[${arglistRefs[i]}]`) ) {
@ -146,6 +147,19 @@ const todo = new Set();
}
}
}
if ( $hasRegexes$ ) {
const { hns } = entries[0];
for ( let i = 0, n = $scriptletFromRegexes$.length; i < n; i += 2 ) {
const regex = new RegExp($scriptletFromRegexes$[i+0]);
for ( const hn of hns ) {
if ( regex.test(hn) === false ) { continue; }
for ( const ref of JSON.parse(`[${$scriptletFromRegexes$[i+1]}]`) ) {
todo.add(ref);
}
}
}
}
if ( todo.size === 0 ) { return; }
// Execute scriptlets
{

View file

@ -112,7 +112,6 @@ function addExtendedToDNR(context, parser) {
for ( const { hn, not, bad } of parser.getExtFilterDomainIterator() ) {
if ( bad ) { continue; }
if ( exception ) { continue; }
if ( isRegexOrPath(hn) ) { continue; }
let details = context.scriptletFilters.get(argsToken);
if ( details === undefined ) {
context.scriptletFilters.set(argsToken, details = { args });
@ -226,8 +225,6 @@ function addExtendedToDNR(context, parser) {
for ( const { hn, not, bad } of parser.getExtFilterDomainIterator() ) {
if ( bad ) { continue; }
if ( not && exception ) { continue; }
// TODO: Support regex- and path-based entries
if ( isRegexOrPath(hn) ) { continue; }
if ( not || exception ) {
excludeMatches.push(hn);
} else if ( hn !== '*' ) {