
refactor: up to ~3x faster URL matcher (40ms to 14ms)

+ show match-compilation errors in the web tab's console
+ reduce the main cache lifetime to 5 minutes
+ use a separate cache for the matcher, since it may hold thousands of rules and URLs
+ use separate caches for each type of match, halving the memory consumed by keys
+ toggle the batch just once per URL instead of once per script (usage sketched below)
tophf · 3 years ago · commit bbe22951d5
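
A condensed usage sketch of the flow behind the first and last bullets (the real wiring is in the db.js and preinject.js hunks below); the wrapper function and the console.warn stand-in are illustrative, not part of the commit:

    import { getScriptsByURL } from './db'; // background/utils context assumed

    async function sketchPrepare(url, frameId) {
      // Passing an array enables the tester's batch mode once for the whole URL
      // and collects @match/@include compilation errors instead of silently ignoring bad rules.
      const errors = [];
      const bag = await getScriptsByURL(url, !frameId, errors);
      // In the commit the errors travel through the content bridge and are printed
      // in the tab's console; console.warn stands in for that here.
      if (errors.length) console.warn(errors.join('\n'));
      return bag;
    }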

+ 1 - 4
src/background/utils/cache.js

@@ -2,10 +2,7 @@ import initCache from '@/common/cache';
 import { commands } from './message';
 
 const cache = initCache({
-  /* Keeping the data for one hour since chrome.storage.local is insanely slow in Chrome,
-     it can takes seconds to read it when injecting tabs with a big script/value, which delays
-     all other scripts in this tab and they will never be able to run at document-start. */
-  lifetime: 60 * 60 * 1000,
+  lifetime: 5 * 60 * 1000,
 });
 
 Object.assign(commands, {

+ 4 - 2
src/background/utils/db.js

@@ -7,7 +7,7 @@ import { ICON_PREFIX, INJECT_PAGE, INJECT_AUTO, TIMEOUT_WEEK } from '@/common/co
 import { deepSize, forEachEntry, forEachKey, forEachValue } from '@/common/object';
 import pluginEvents from '../plugin/events';
 import { getNameURI, parseMeta, newScript, getDefaultCustom } from './script';
-import { testScript, testBlacklist } from './tester';
+import { testScript, testBlacklist, testerBatch } from './tester';
 import { preInitialize } from './init';
 import { commands } from './message';
 import patchDB from './patch-db';
@@ -239,7 +239,8 @@ const retriedStorageKeys = {};
 /**
  * @desc Get scripts to be injected to page with specific URL.
  */
-export function getScriptsByURL(url, isTop) {
+export function getScriptsByURL(url, isTop, errors) {
+  testerBatch(errors || true);
   const allScripts = testBlacklist(url)
     ? []
     : store.scripts.filter(script => (
@@ -247,6 +248,7 @@ export function getScriptsByURL(url, isTop) {
       && (isTop || !(script.custom.noframes ?? script.meta.noframes))
       && testScript(url, script)
     ));
+  testerBatch();
   return getScriptEnv(allScripts);
 }
 

+ 2 - 2
src/background/utils/icon.js

@@ -1,5 +1,5 @@
 import { i18n, makeDataUri, noop } from '@/common';
-import { ICON_PREFIX, INJECTABLE_TAB_URL_RE } from '@/common/consts';
+import { BLACKLIST, ICON_PREFIX, INJECTABLE_TAB_URL_RE } from '@/common/consts';
 import { objectPick } from '@/common/object';
 import { postInitialize } from './init';
 import { commands, forEachTab } from './message';
@@ -78,7 +78,7 @@ hookOptions((changes) => {
   || (v = changes[KEY_BADGE_COLOR_BLOCKED]) && (badgeColorBlocked = v)) {
     jobs.push(updateBadgeColor);
   }
-  if ('blacklist' in changes) {
+  if (BLACKLIST in changes) {
     jobs.push(updateState);
   }
   if (jobs.length) {

+ 4 - 3
src/background/utils/preinject.js

@@ -271,8 +271,8 @@ function prepare(key, url, tabId, frameId, forceContent) {
  */
 async function prepareScripts(res, cacheKey, url, tabId, frameId, forceContent) {
   const errors = [];
-  const bag = await getScriptsByURL(url, !frameId);
-  const { envDelayed, [ENV_SCRIPTS]: scripts } = bag;
+  const bag = await getScriptsByURL(url, !frameId, errors);
+  const { envDelayed, disabledIds: ids, [ENV_SCRIPTS]: scripts } = bag;
   const isLate = forceContent != null;
   bag[FORCE_CONTENT] = forceContent; // used in prepareScript and isPageRealm
   const feedback = scripts.map(prepareScript, bag).filter(Boolean);
@@ -293,10 +293,11 @@ async function prepareScripts(res, cacheKey, url, tabId, frameId, forceContent)
       envKey, // InjectionFeedback cache key for envDelayed
     },
     hasMore: !!more, // tells content bridge to expect envDelayed
-    ids: bag.disabledIds, // content bridge adds the actually running ids and sends via SetPopup
+    ids, // content bridge adds the actually running ids and sends via SetPopup
     info: {
       ua,
     },
+    errors: errors.filter(err => !ids.includes(+err.slice(err.lastIndexOf('#') + 1))).join('\n'),
   });
   res[FEEDBACK] = feedback;
   res[CSAPI_REG] = contentScriptsAPI && !isLate && !xhrInject
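
The errors filter above drops messages for scripts whose ids are in the disabled-ids list; it parses the script id from the tail of the error string, which the tester builds as `<error> - <pretty script URL>`. That the pretty URL ends in `#<script id>` is an assumption inferred from the `lastIndexOf('#')` parsing here, since getScriptPrettyUrl itself is not shown in this diff:

    // Illustration only; the pretty-URL strings below are hypothetical.
    const ids = [12, 34]; // bag.disabledIds
    const errors = [
      'Invalid @match htp://x/* - pretty-url-of-script#12',
      'SyntaxError: Invalid regular expression - pretty-url-of-script#56',
    ];
    const shown = errors
      .filter(err => !ids.includes(+err.slice(err.lastIndexOf('#') + 1)))
      .join('\n'); // only the #56 line survives: script 12 is disabled anyway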

+ 179 - 133
src/background/utils/tester.js

@@ -1,84 +1,119 @@
+import { getScriptPrettyUrl } from '@/common';
+import { BLACKLIST, BLACKLIST_ERRORS } from '@/common/consts';
+import initCache from '@/common/cache';
 import * as tld from '@/common/tld';
-import cache from './cache';
 import { postInitialize } from './init';
 import { commands } from './message';
 import { getOption, hookOptions } from './options';
+import storage from './storage';
 
 Object.assign(commands, {
   TestBlacklist: testBlacklist,
 });
 
-postInitialize.push(resetBlacklist);
-
-tld.initTLD(true);
-
+const matchAlways = () => 1;
+/**
+ * Using separate caches to avoid the memory cost of prefixing thousands of long URLs
+ * TODO: switch `cache` to hubs internally and add a prefix parameter or accept an Array for key
+ */
+const cacheMat = initCache({ lifetime: 60 * 60e3 });
+const cacheInc = initCache({ lifetime: 60 * 60e3 });
+const cacheResultMat = initCache({ lifetime: 60e3 });
+const cacheResultInc = initCache({ lifetime: 60e3 });
 const RE_MATCH_PARTS = /(.*?):\/\/([^/]*)\/(.*)/;
-let blacklistRules = [];
-hookOptions((changes) => {
-  if ('blacklist' in changes) resetBlacklist(changes.blacklist || '');
-});
 const RE_HTTP_OR_HTTPS = /^https?$/i;
-
-/*
- Simple FIFO queue for the results of testBlacklist, cached separately from the main |cache|
- because the blacklist is updated only once in a while so its entries would be crowding
- the main cache and reducing its performance (objects with lots of keys are slow to access).
-
- We also don't need to auto-expire the entries after a timeout.
- The only limit we're concerned with is the overall memory used.
- The limit is specified in the amount of unicode characters (string length) for simplicity.
- Disregarding deduplication due to interning, the actual memory used is approximately twice as big:
- 2 * keyLength + objectStructureOverhead * objectCount
-*/
 const MAX_BL_CACHE_LENGTH = 100e3;
 let blCache = {};
 let blCacheSize = 0;
+let blacklistRules = [];
+let batchErrors;
 
-function testRules(url, rules, prefix, ruleBuilder) {
-  return rules.some(rule => {
-    const key = `${prefix}:${rule}`;
-    const matcher = cache.get(key) || cache.put(key, ruleBuilder(rule));
-    return matcher.test(url);
-  });
-}
+postInitialize.push(resetBlacklist);
+hookOptions((changes) => {
+  if (BLACKLIST in changes) {
+    const errors = resetBlacklist(changes[BLACKLIST] || []);
+    const res = errors.length ? errors : null;
+    storage.base.setOne(BLACKLIST_ERRORS, res);
+    if (res) throw res; // will be passed to the UI
+  }
+});
+tld.initTLD(true);
 
-/**
- * Test glob rules like `@include` and `@exclude`.
- */
-export function testGlob(url, rules) {
-  return testRules(url, rules, 're', autoReg);
+export function testerBatch(arr) {
+  cacheMat.batch(arr);
+  cacheInc.batch(arr);
+  cacheResultMat.batch(arr);
+  cacheResultInc.batch(arr);
+  batchErrors = Array.isArray(arr) && arr;
 }
 
 /**
- * Test match rules like `@match` and `@exclude_match`.
+ * As this code is *very* hot, we avoid calling functions, creating possibly big arrays,
+ * or copying thousands of keys by prefixing them in `cache`, thus avoiding pauses
+ * due to major GC. The speedup is ~3x (from ~40ms to ~14ms) on a 4GHz CPU
+ * with popular scripts that have lots of @match rules, e.g. Handy Image.
  */
-export function testMatch(url, rules) {
-  return testRules(url, rules, 'match', matchTester);
-}
-
 export function testScript(url, script) {
-  cache.batch(true);
+  let matex1; // main @match / @exclude-match
+  let matex2; // custom @match / @exclude-match
+  let inex1; // main @include / @exclude
+  let inex2; // custom @include / @exclude
   const { custom, meta } = script;
-  const mat = mergeLists(custom.origMatch && meta.match, custom.match);
-  const inc = mergeLists(custom.origInclude && meta.include, custom.include);
-  const exc = mergeLists(custom.origExclude && meta.exclude, custom.exclude);
-  const excMat = mergeLists(custom.origExcludeMatch && meta.excludeMatch, custom.excludeMatch);
-  // match all if no @match or @include rule
-  let ok = !mat.length && !inc.length;
-  // @match
-  ok = ok || testMatch(url, mat);
-  // @include
-  ok = ok || testGlob(url, inc);
-  // @exclude-match
-  ok = ok && !testMatch(url, excMat);
-  // @exclude
-  ok = ok && !testGlob(url, exc);
-  cache.batch(false);
+  const len = (matex1 = custom.origMatch && meta.match || '').length
+    + (matex2 = custom.match || '').length
+    + (inex1 = custom.origInclude && meta.include || '').length
+    + (inex2 = custom.include || '').length;
+  const ok = (
+    // Ok if lists are empty or @match + @include apply
+    !len || testRules(url, script, matex1, matex2, inex1, inex2)
+  ) && !(
+    // and no excludes apply
+    ((matex1 = custom.origExcludeMatch && meta.excludeMatch || '').length
+      + (matex2 = custom.excludeMatch || '').length
+      + (inex1 = custom.origExclude && meta.exclude || '').length
+      + (inex2 = custom.exclude || '').length
+    ) && testRules(url, script, matex1, matex2, inex1, inex2)
+  );
   return ok;
 }
 
-function mergeLists(...args) {
-  return args.reduce((res, item) => (item ? res.concat(item) : res), []);
+function testRules(url, script, ...list) {
+  // TODO: combine all non-regex rules in one big smart regexp
+  // e.g. lots of `*://foo/*` can be combined into `^https?://(foo|bar|baz)/`
+  for (let i = 0, m, rules, builder, cache, urlResults, res; i < 4; i += 1) {
+    // [matches, matches, includes, includes], some items may be empty
+    if ((rules = list[i]).length) {
+      if (!cache) { // happens one time for 0 or 1 and another time for 2 or 3
+        if (i < 2) { // matches1, matches2
+          builder = matchTester;
+          cache = cacheMat;
+          urlResults = cacheResultMat;
+        } else { // includes1, includes2
+          builder = autoReg;
+          cache = cacheInc;
+          urlResults = cacheResultInc;
+        }
+        urlResults = urlResults.get(url) || urlResults.put(url, {});
+      }
+      for (const rule of rules) {
+        if ((res = urlResults[rule]) != null) {
+          if (res) return true;
+          continue; // a cached negative result only skips this rule, not the rest
+        }
+        if (!(m = cache.get(rule))) {
+          try {
+            m = builder(rule);
+            cache.put(rule, m);
+          } catch (err) {
+            if (batchErrors) batchErrors.push(`${err} - ${getScriptPrettyUrl(script)}`);
+          }
+        }
+        if (m && (urlResults[rule] = m.test(url))) {
+          return true;
+        }
+      }
+    }
+    if (i === 1) cache = false; // this will switch cache+builder for includes if they're non-empty
+  }
 }
 
 function str2RE(str) {
@@ -86,34 +121,25 @@ function str2RE(str) {
   return re;
 }
 
-function bindRE(re) {
-  return re.test.bind(re);
-}
-
 function autoReg(str) {
   // regexp mode: case-insensitive per GM documentation
   if (str.length > 1 && str[0] === '/' && str[str.length - 1] === '/') {
-    let re;
-    try { re = new RegExp(str.slice(1, -1), 'i'); } catch (e) { /* ignore */ }
-    return { test: re ? bindRE(re) : () => false };
+    return new RegExp(str.slice(1, -1), 'i');
   }
   // glob mode: case-insensitive to match GM4 & Tampermonkey bugged behavior
   const reStr = str2RE(str.toLowerCase());
-  if (tld.isReady() && str.includes('.tld/')) {
-    const reTldStr = reStr.replace('\\.tld/', '((?:\\.[-\\w]+)+)/');
-    return {
-      test: (tstr) => {
-        const matches = tstr.toLowerCase().match(reTldStr);
-        if (matches) {
-          const suffix = matches[1].slice(1);
-          if (tld.getPublicSuffix(suffix) === suffix) return true;
-        }
-        return false;
-      },
-    };
+  const reTldStr = reStr.replace('\\.tld/', '((?:\\.[-\\w]+)+)/');
+  if (reStr !== reTldStr) {
+    return { test: matchTld.bind([reTldStr]) };
   }
-  const re = new RegExp(`^${reStr}$`, 'i'); // String with wildcards
-  return { test: bindRE(re) };
+  // String with wildcards
+  return RegExp(`^${reStr}$`, 'i');
+}
+
+function matchTld(tstr) {
+  const matches = tstr.toLowerCase().match(this[0]);
+  const suffix = matches?.[1].slice(1);
+  return suffix && tld.getPublicSuffix(suffix) === suffix;
 }
 
 function matchScheme(rule, data) {
@@ -121,10 +147,7 @@ function matchScheme(rule, data) {
   if (rule === data) return 1;
   // * = http | https
   // support http*
-  if ([
-    '*',
-    'http*',
-  ].includes(rule) && RE_HTTP_OR_HTTPS.test(data)) return 1;
+  if ((rule === '*' || rule === 'http*') && RE_HTTP_OR_HTTPS.test(data)) return 1;
   return 0;
 }
 
@@ -133,7 +156,7 @@ const RE_STR_TLD = '((?:\\.[-\\w]+)+)';
 function hostMatcher(rule) {
   // * matches all
   if (rule === '*') {
-    return () => 1;
+    return matchAlways;
   }
   // *.example.com
   // www.google.*
@@ -151,20 +174,22 @@ function hostMatcher(rule) {
     suffix = RE_STR_TLD;
   }
   const re = new RegExp(`^${prefix}${str2RE(base)}${suffix}$`);
-  return (data) => {
-    // exact match, case-insensitive
-    data = data.toLowerCase();
-    if (ruleLC === data) return 1;
-    // full check
-    const matches = data.match(re);
-    if (matches) {
-      const [, tldStr] = matches;
-      if (!tldStr) return 1;
-      const tldSuffix = tldStr.slice(1);
-      return tld.getPublicSuffix(tldSuffix) === tldSuffix;
-    }
-    return 0;
-  };
+  return hostMatcherFunc.bind([ruleLC, re]);
+}
+
+function hostMatcherFunc(data) {
+  // exact match, case-insensitive
+  data = data.toLowerCase();
+  if (this[0] === data) return 1;
+  // full check
+  const matches = data.match(this[1]);
+  if (matches) {
+    const [, tldStr] = matches;
+    if (!tldStr) return 1;
+    const tldSuffix = tldStr.slice(1);
+    return tld.getPublicSuffix(tldSuffix) === tldSuffix;
+  }
+  return 0;
 }
 
 function pathMatcher(rule) {
@@ -176,78 +201,99 @@ function pathMatcher(rule) {
     if (iQuery < 0) strRe = `^${strRe}(?:[?#]|$)`;
     else strRe = `^${strRe}(?:#|$)`;
   }
-  return bindRE(new RegExp(strRe));
+  return RegExp(strRe);
 }
 
 function matchTester(rule) {
   let test;
   if (rule === '<all_urls>') {
-    test = () => true;
+    test = matchAlways;
   } else {
     const ruleParts = rule.match(RE_MATCH_PARTS);
     if (ruleParts) {
-      const matchHost = hostMatcher(ruleParts[2]);
-      const matchPath = pathMatcher(ruleParts[3]);
-      test = (url) => {
-        const parts = url.match(RE_MATCH_PARTS);
-        return !!ruleParts && !!parts
-          && matchScheme(ruleParts[1], parts[1])
-          && matchHost(parts[2])
-          && matchPath(parts[3]);
-      };
+      test = matchTesterFunc.bind([
+        ruleParts[1],
+        hostMatcher(ruleParts[2]),
+        pathMatcher(ruleParts[3]),
+      ]);
     } else {
-      // Ignore invalid match rules
-      test = () => false;
+      throw `Invalid @match ${rule}`;
     }
   }
   return { test };
 }
 
+function matchTesterFunc(url) {
+  const parts = url.match(RE_MATCH_PARTS);
+  return +!!(parts
+    && matchScheme(this[0], parts[1])
+    && this[1](parts[2])
+    && this[2].test(parts[3])
+  );
+}
+
 export function testBlacklist(url) {
   let res = blCache[url];
   if (res === undefined) {
-    const rule = blacklistRules.find(({ test }) => test(url));
+    const rule = blacklistRules.find(m => m.test(url));
     res = rule?.reject && rule.text;
     updateBlacklistCache(url, res || false);
   }
   return res;
 }
 
-export function resetBlacklist(list) {
-  cache.batch(true);
-  const rules = list == null ? getOption('blacklist') : list;
+export function resetBlacklist(rules = getOption(BLACKLIST)) {
+  const emplace = (cache, rule, builder) => cache.get(rule) || cache.put(rule, builder(rule));
+  const errors = [];
+  testerBatch(true);
   if (process.env.DEBUG) {
     console.info('Reset blacklist:', rules);
   }
   // XXX compatible with {Array} list in v2.6.1-
   blacklistRules = (Array.isArray(rules) ? rules : (rules || '').split('\n'))
-  .map((text) => {
-    text = text.trim();
-    if (!text || text.startsWith('#')) return null;
-    const mode = text.startsWith('@') && text.split(/\s/, 1)[0];
-    const rule = mode ? text.slice(mode.length + 1).trim() : text;
-    const reject = mode !== '@include' && mode !== '@match'; // @include and @match = whitelist
-    const { test } = mode === '@include' || mode === '@exclude' && autoReg(rule)
-      || !mode && !rule.includes('/') && matchTester(`*://${rule}/*`) // domain
-      || matchTester(rule); // @match and @exclude-match
-    return { reject, test, text };
-  })
-  .filter(Boolean);
+  .reduce((res, text) => {
+    try {
+      text = text.trim();
+      if (!text || text.startsWith('#')) return res;
+      const mode = text.startsWith('@') && text.split(/\s/, 1)[0];
+      const rule = mode ? text.slice(mode.length + 1).trim() : text;
+      const isInc = mode === '@include';
+      const m = (isInc || mode === '@exclude') && emplace(cacheInc, rule, autoReg)
+      || !mode && !rule.includes('/') && emplace(cacheMat, `*://${rule}/*`, matchTester) // domain
+      || emplace(cacheMat, rule, matchTester); // @match and @exclude-match
+      m.reject = !(mode === '@match' || isInc); // @include and @match = whitelist
+      m.text = text;
+      res.push(m);
+    } catch (err) {
+      errors.push(err);
+    }
+    return res;
+  }, []);
   blCache = {};
   blCacheSize = 0;
-  cache.batch(false);
+  testerBatch();
+  return errors;
 }
 
+/**
+ Simple FIFO queue for the results of testBlacklist, cached separately from the main |cache|
+ because the blacklist is updated only once in a while so its entries would be crowding
+ the main cache and reducing its performance (objects with lots of keys are slow to access).
+ We also don't need to auto-expire the entries after a timeout.
+ The only limit we're concerned with is the overall memory used.
+ The limit is specified in the amount of unicode characters (string length) for simplicity.
+ Disregarding deduplication due to interning, the actual memory used is approximately twice as big:
+ 2 * keyLength + objectStructureOverhead * objectCount
+*/
 function updateBlacklistCache(key, value) {
   blCache[key] = value;
   blCacheSize += key.length;
   if (blCacheSize > MAX_BL_CACHE_LENGTH) {
-    Object.keys(blCache)
-    .some((k) => {
-      blCacheSize -= k.length;
-      delete blCache[k];
-      // reduce the cache to 75% so that this function doesn't run too often.
-      return blCacheSize < MAX_BL_CACHE_LENGTH * 3 / 4;
-    });
+    for (const k in blCache) {
+      if (delete blCache[k] && (blCacheSize -= k.length) < MAX_BL_CACHE_LENGTH * 0.75) {
+        // Reduced the cache to 75% so that this function doesn't run too often
+        return;
+      }
+    }
   }
 }
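
A minimal standalone sketch of the two-level caching scheme described in the comments at the top of this file: compiled matchers are cached per rule for an hour, boolean results per URL for a minute, and keeping @match and @include in separate caches avoids building prefixed key strings in the hot loop. Names below are local to this example:

    import initCache from '@/common/cache';

    // Compiled matchers are reused across URLs: keep them per rule for an hour.
    const compiled = initCache({ lifetime: 60 * 60e3 });
    // Per-URL boolean results go stale quickly as tabs navigate: keep them for a minute.
    const results = initCache({ lifetime: 60e3 });

    function testOneRule(url, rule, build) {
      const perUrl = results.get(url) || results.put(url, {});
      let res = perUrl[rule];
      if (res == null) {
        const matcher = compiled.get(rule) || compiled.put(rule, build(rule));
        res = perUrl[rule] = matcher.test(url);
      }
      return res;
    }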

+ 4 - 5
src/common/cache.js

@@ -15,6 +15,7 @@ export default function initCache({
   let batchStartTime;
   // eslint-disable-next-line no-return-assign
   const getNow = () => batchStarted && batchStartTime || (batchStartTime = performance.now());
+  const OVERRUN = 1000; // in ms, to reduce frequency of calling setTimeout
   const exports = {
     batch, get, getValues, pop, put, del, has, hit, destroy,
   };
@@ -80,12 +81,10 @@ export default function initCache({
       clearTimeout(timer);
     }
     minLifetime = lifetime;
-    timer = setTimeout(trim, lifetime);
+    timer = setTimeout(trim, lifetime + OVERRUN);
   }
   function trim() {
-    // next timer won't be able to run earlier than 10ms
-    // so we'll sweep the upcoming expired entries in this run
-    const now = performance.now() + 10;
+    const now = performance.now();
     let closestExpiry = Number.MAX_SAFE_INTEGER;
     // eslint-disable-next-line guard-for-in
     for (const key in cache) {
@@ -98,7 +97,7 @@ export default function initCache({
     }
     minLifetime = closestExpiry - now;
     timer = closestExpiry < Number.MAX_SAFE_INTEGER
-      ? setTimeout(trim, minLifetime)
+      ? setTimeout(trim, minLifetime + OVERRUN)
       : 0;
   }
 }
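
A rough illustration, with hypothetical expiry times, of why the trim timer is now scheduled OVERRUN late: entries that would previously each have needed their own setTimeout pass are swept together, at the cost of lingering up to ~1s longer:

    const expiries = [1000, 1300, 1900]; // hypothetical entry expiry times in ms
    const OVERRUN = 1000;
    // Before: trim ran at min(expiries) + 10ms, expired only the first entry,
    // and had to reschedule for the other two.
    // Now: trim runs at min(expiries) + OVERRUN = 2000ms and sweeps all three at once.
    const nextTrim = Math.min(...expiries) + OVERRUN; // 2000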

+ 2 - 0
src/common/consts.js

@@ -35,3 +35,5 @@ export const TIMEOUT_WEEK = 7 * 24 * 60 * 60 * 1000;
 export const extensionRoot = !process.env.IS_INJECTED && browser.runtime.getURL('/') || '';
 export const extensionOrigin = extensionRoot.slice(0, -1);
 export const ICON_PREFIX = `${extensionRoot}public/images/icon`;
+export const BLACKLIST = 'blacklist';
+export const BLACKLIST_ERRORS = `${BLACKLIST}Errors`;

+ 4 - 1
src/injected/content/inject.js

@@ -124,7 +124,10 @@ export function injectPageSandbox(contentId, webId) {
  * @param {boolean} isXml
  */
 export async function injectScripts(contentId, webId, data, isXml) {
-  const { hasMore, info } = data;
+  const { errors, hasMore, info } = data;
+  if (errors) {
+    logging.warn(errors);
+  }
   realms = {
     __proto__: null,
     [INJECT_CONTENT]: {

+ 9 - 2
src/types.d.ts

@@ -173,14 +173,20 @@ declare namespace VMScript {
   }
 }
 /**
- * Injection data sent to the content bridge
+ * Injection data sent to the content bridge when injection is disabled
  */
-declare interface VMInjection {
+declare interface VMInjectionDisabled {
   expose: string | false;
+}
+/**
+ * Injection data sent to the content bridge when injection is enabled
+ */
+declare interface VMInjection extends VMInjectionDisabled {
   scripts: VMInjection.Script[];
   injectInto: VMScriptInjectInto;
   injectPage: boolean;
   cache: StringMap;
+  errors: string[];
   feedId: {
     /** InjectionFeedback cache key for cleanup when getDataFF outruns GetInjected */
     cacheKey: string;
@@ -235,6 +241,7 @@ declare namespace VMInjection {
     dataKey: string;
     displayName: string;
     code: string;
+    injectInto: VMScriptInjectInto;
     metaStr: string;
     runAt?: 'start' | 'body' | 'end' | 'idle';
     values?: StringMap;