get second-level domain from publicsuffixlist

When suggesting a sitename we try to find the "significant" part of the URL. For www.google.com that would be google.com, but just keeping the last two parts (or removing the first one) fails too often; amazon.co.uk is one example. Further, each TLD has its own policy here, so an algorithmic approach is bound to fail. https://publicsuffix.org/ tries to gather all possible SLDs. It might not be perfect, but it is better than what we have (hardcoding a couple like (com|edu|co).*). The list is rather large, but with some clever(?) tricks we can get it down to an acceptable size. Going a bit crazy here: browsers don't support gzip/deflate data yet (waiting for the Compression Streams API), and other compression schemes where reasonable libs are available simply don't cut it on the compression rate. In the meantime, PNG is lossless and uses deflate compression - exactly what we need :) So this patch pre-processes the PSL list for easy lookup (removing a lot of redundant text) and exports the result as a JSON dictionary. This is then converted to PNG by ImageMagick. The browser loads the image, we access the pixel values and end up with our desired JSON dict. Issue #69
ttyridal · Dec 14, 2021 · 31bcb3a · 31bcb3a
1 parent 73cdb21
commit 31bcb3a
Show file tree

Hide file tree

Showing 8 changed files with 309 additions and 3 deletions.
diff --git a/.gitignore b/.gitignore
@@ -11,3 +11,4 @@ scrypt-asm.js
 *.msi
 *.exe
 /libscrypt/crypto_scrypt-nosse-nommap.c
+publicsuffixlist/public_suffix_list.dat
diff --git a/ext/webextension/package-lock.json b/ext/webextension/package-lock.json
diff --git a/ext/webextension/package.json b/ext/webextension/package.json
@@ -7,12 +7,15 @@
         "unittest": "node --experimental-vm-modules node_modules/jest/bin/jest.js --silent=false src/*.test.js"
     },
     "type": "module",
-    "jest": { "transform": {} },
+    "jest": {
+        "transform": {}
+    },
     "devDependencies": {
+        "eslint": "^8.2.0",
         "jest": "^27.3.1",
         "jest-puppeteer": "^6.0.0",
         "jest-webextension-mock": "^3.7.19",
-        "puppeteer": "^11.0.0",
-        "eslint": "^8.2.0"
+        "pngjs3": "^6.0.1",
+        "puppeteer": "^11.0.0"
     }
 }
diff --git a/ext/webextension/src/lib/psllookup.js b/ext/webextension/src/lib/psllookup.js
@@ -0,0 +1,67 @@
+/**
+ * Load an image from a URL.
+ *
+ * @param {string} url - Location of the image to load.
+ * @returns {Promise<HTMLImageElement>} Resolves with the image element once
+ *     its `load` event fires; rejects with an Error if loading fails.
+ */
+function loadImage(url) {
+    return new Promise((resolve, reject) => {
+        const img = new Image();
+        img.onload = () => resolve(img);
+        // Without an error handler the promise would stay pending forever
+        // when the image is missing or cannot be decoded.
+        img.onerror = () => reject(new Error(`failed to load image: ${url}`));
+        // Assign src last so both handlers are attached before loading starts.
+        img.src = url;
+    });
+}
+
+/**
+ * Load an image and return its pixel data.
+ *
+ * The image is drawn onto an off-screen canvas of the same size and the
+ * canvas content is read back with getImageData.
+ *
+ * @param {string} url - Location of the image to read.
+ * @returns {Promise<Uint8ClampedArray>} The `data` array of the canvas
+ *     ImageData (four bytes per pixel, per the Canvas API).
+ */
+async function getPixels(url) {
+    const image = await loadImage(url);
+    const {height, width} = image;
+
+    const surface = document.createElement('canvas');
+    surface.height = height;
+    surface.width = width;
+
+    const ctx = surface.getContext('2d');
+    ctx.drawImage(image, 0, 0);
+
+    const {data} = ctx.getImageData(0, 0, width, height);
+    return data;
+}
+
+
+/**
+ * Decode pixel data into the text it encodes.
+ *
+ * Only every fourth byte (indices 0, 4, 8, ...) is kept - presumably the
+ * first colour channel of 4-byte-per-pixel data; the remaining bytes are
+ * dropped. The kept bytes are wrapped in a Blob and read back as text.
+ *
+ * @param {Uint8ClampedArray|Uint8Array} pixeldata - Raw pixel bytes.
+ * @returns {Promise<string>} The decoded (still unparsed) JSON text.
+ */
+function pixeldata_to_json(pixeldata) {
+    const everyFourth = pixeldata.filter((_value, index) => index % 4 === 0);
+    return new Blob([everyFourth], {type: 'text/plain; charset=utf-8'}).text();
+}
+
+/**
+ * Finds the registrable domain of a host name using a lookup table derived
+ * from the Public Suffix List.
+ *
+ * The table is a nested dictionary keyed by domain labels, starting at the
+ * TLD: `{uk: {co: 0}}` represents the suffix `co.uk`. A `0` value marks the
+ * end of a suffix and a `*` key matches any single label.
+ */
+export class PslLookup {
+    /**
+     * Start loading the lookup table. Loading is asynchronous; call
+     * waitTableReady() before using getPublicDomain().
+     *
+     * @param {Object} [args]
+     * @param {function(string): Promise} [args.tableLoader=getPixels] -
+     *     Returns (a promise of) the pixel data for `tableurl`.
+     * @param {string} [args.tableurl="./psllookup.json.png"] - Location of
+     *     the PNG-encoded table.
+     */
+    constructor(args) {
+        const opts = Object.assign({tableLoader: getPixels, tableurl: "./psllookup.json.png"}, args || {});
+        // Best effort: a failure is logged and leaves the table undefined.
+        this.psltable = opts.tableLoader(opts.tableurl)
+        .then(pixeldata_to_json)
+        .then(JSON.parse)
+        .catch(e=>{console.log("something is failing",e)});
+    }
+
+    /**
+     * Wait for the table to finish loading and replace the pending promise
+     * in `this.psltable` with the loaded table.
+     *
+     * @returns {Promise<void>}
+     */
+    async waitTableReady() {
+        const lut = await this.psltable;
+        this.psltable = lut;
+    }
+
+    /**
+     * Reduce a host name to its public suffix plus one more label.
+     *
+     * @param {string} url - Dot-separated host name, e.g. "shop.amazon.co.uk".
+     * @returns {string} The registrable domain, e.g. "amazon.co.uk". A host
+     *     that is itself a suffix is returned unchanged.
+     */
+    getPublicDomain(url) {
+        // Own-property check only: the `in` operator also sees inherited
+        // names such as "constructor" or "toString", which would make
+        // ordinary labels look like table entries.
+        const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
+        const parts = url.split('.').reverse();
+        const res = [];
+        // If the table failed to load, fall back to an empty one so that
+        // only the implicit TLD rule below applies.
+        let lut = this.psltable || {};
+        let v;
+
+        for (v = 0; v < parts.length; v++) {
+            const part = parts[v];
+            if (!lut) break;
+            if (hasOwn(lut, part)) {
+                res.push(part);
+                lut = lut[part];
+            } else if (hasOwn(lut, '*') || v === 0) {
+                // Either a wildcard rule, or a TLD without any table entry:
+                // the Public Suffix List treats an unlisted TLD as if the
+                // rule "*" applied, so the TLD itself is the suffix.
+                res.push(part);
+                lut = null;
+            } else {
+                break;
+            }
+        }
+        // Add the one label in front of the suffix, when there is one.
+        if (v < parts.length) {
+            res.push(parts[v]);
+        }
+
+        return res.reverse().join('.');
+    }
+}
diff --git a/ext/webextension/src/lib/psllookup.json.png b/ext/webextension/src/lib/psllookup.json.png
diff --git a/ext/webextension/src/lib/psllookup.test.js b/ext/webextension/src/lib/psllookup.test.js
@@ -0,0 +1,55 @@
+/* globals global */
+"use strict";
+import {it, expect} from '@jest/globals'
+import {PslLookup} from './psllookup.js'
+import fs from 'fs';
+import {PNG} from 'pngjs3'
+// import { sync as PNGSync } from 'pngjs3';
+import { URL } from 'url';
+
+/**
+ * Test replacement for the browser table loader: read a PNG from disk and
+ * return its decoded pixel data.
+ *
+ * @param {string} url - Path of the PNG, resolved relative to this module.
+ * @returns {Promise<Buffer>} The `data` member of the parsed PNG; rejects
+ *     if the PNG cannot be parsed.
+ */
+function pngPixels(url) {
+    const url_abspath = new URL(url, import.meta.url).pathname;
+    const filedata = fs.readFileSync(url_abspath);
+
+    return new Promise((resolve, reject) => {
+        new PNG().parse(filedata, (error, png) => {
+            // Surface parse failures instead of dereferencing a missing result.
+            if (error) {
+                reject(error);
+                return;
+            }
+            resolve(png.data);
+        });
+    });
+}
+
+/**
+ * Minimal stand-in for the browser Blob: stringifies the constructor
+ * argument up front and hands that string back from text().
+ */
+class MockBlob {
+    constructor(data/*, params*/) {
+        const contents = data.toString("utf8");
+        this.text = () => Promise.resolve(contents);
+    }
+}
+global.Blob = MockBlob;
+
+// End-to-end check: load the PNG-encoded table from disk (pngPixels replaces
+// the browser image loader) and resolve a set of representative host names.
+it('gets the correct domain from url', async () => {
+
+
+    const psl = new PslLookup({tableLoader: pngPixels});
+    // The table loads asynchronously; lookups are only valid once it is ready.
+    await psl.waitTableReady()
+    const getDomain = psl.getPublicDomain.bind(psl);
+
+    // Plain TLDs and two-level suffixes.
+    expect(getDomain('example.com')).toBe('example.com');
+    expect(getDomain('amazon.com')).toBe('amazon.com');
+    expect(getDomain('show.amazon.com')).toBe('amazon.com');
+    expect(getDomain('amazon.co.uk')).toBe('amazon.co.uk');
+    expect(getDomain('shop.amazon.co.uk')).toBe('amazon.co.uk');
+    expect(getDomain('tyridal.no')).toBe('tyridal.no');
+    expect(getDomain('digi.gitapp.si')).toBe('digi.gitapp.si');
+    expect(getDomain('www.tyridal.no')).toBe('tyridal.no');
+    expect(getDomain('torbjorn.tyridal.no')).toBe('tyridal.no');
+    // Deeper suffixes: one label is kept in front of the suffix.
+    expect(getDomain('wilson.no.eu.org')).toBe('wilson.no.eu.org');
+    expect(getDomain('xxx.wilson.no.eu.org')).toBe('wilson.no.eu.org');
+    expect(getDomain('weare.org.om')).toBe('weare.org.om');
+    expect(getDomain('rave.weare.org.om')).toBe('weare.org.om');
+    expect(getDomain('rave.blogspot.co.nz')).toBe('rave.blogspot.co.nz');
+    expect(getDomain('rave.blogspot.com')).toBe('rave.blogspot.com');
+    expect(getDomain('xx.rave.blogspot.co.nz')).toBe('rave.blogspot.co.nz');
+    expect(getDomain('xx.rave.blogspot.com')).toBe('rave.blogspot.com');
+    // A host that is itself a listed suffix comes back unchanged.
+    expect(getDomain('blogspot.com')).toBe('blogspot.com');
+
+});
diff --git a/publicsuffixlist/Makefile b/publicsuffixlist/Makefile
@@ -0,0 +1,7 @@
+# Build the PNG-encoded lookup table: psl_to_pgm.py writes a PGM image to
+# stdout and ImageMagick's convert re-encodes it as a PNG.
+../ext/webextension/src/lib/psllookup.json.png: public_suffix_list.dat
+	python psl_to_pgm.py public_suffix_list.dat | convert pgm:- -strip -define png:compression-filter=1 ../ext/webextension/src/lib/psllookup.json.png
+
+# Download the Public Suffix List.
+public_suffix_list.dat:
+	wget https://publicsuffix.org/list/public_suffix_list.dat
+
+
diff --git a/publicsuffixlist/psl_to_pgm.py b/publicsuffixlist/psl_to_pgm.py
@@ -0,0 +1,103 @@
+from __future__ import print_function
+from collections import *
+import sys
+import json
+
+# Nested defaultdict: reading a missing key creates another nested tree.
+tree = lambda: defaultdict(tree)
+
+def is_ascii(s):
+    """Return True when every character of ``s`` has a code point below 128."""
+    return not any(ord(ch) >= 128 for ch in s)
+
+def build_tree_from_psl(pslfilename):
+    """Build a nested label tree from a Public Suffix List file.
+
+    Each multi-label rule is stored with its labels reversed, so the rule
+    ``co.uk`` ends up as ``tree['uk']['co']``. Blank lines, lines starting
+    with ``/``, single-label rules and ``!`` rules are skipped.
+
+    pslfilename: path of the public_suffix_list.dat file.
+    Returns the root of the tree (nested defaultdicts).
+    """
+    domain_tree = tree()
+
+    # 'with' closes the file again once it has been read.
+    with open(pslfilename) as pslfile:
+        for line in pslfile:
+            rule = line.strip()
+            if not rule or rule[0] == '/' or '.' not in rule:
+                continue
+
+            if rule[0] == '!':
+                continue  ## deal with those later
+
+            node = domain_tree
+            for label in rule.split('.')[::-1]:
+                if not is_ascii(label):
+                    # The ACE prefix of an internationalised label is
+                    # "xn--" (two hyphens); with a single hyphen the entry
+                    # never matches the host names a browser reports.
+                    label = "xn--" + label.encode('punycode').decode('ascii')
+                node = node[label]
+
+    return domain_tree
+
+
+# convert defaultdict to dict and replace empty dicts (leafs)
+# with single 0 value
+def walk(d, dst):
+    """Copy the nested mapping ``d`` into the plain dict ``dst``.
+
+    Empty sub-mappings (leaves) are stored as 0; non-empty ones become
+    ordinary dicts that are filled recursively.
+    """
+    for label, subtree in d.items():
+        if not subtree:
+            dst[label] = 0
+            continue
+        branch = dict()
+        dst[label] = branch
+        walk(subtree, branch)
+
+
+## convert bytearray s to P5 PGM image
+def pgmdump(s):
+    """Print ``s`` to stdout as a P5 PGM image.
+
+    The image is laid out with at most 4096 characters per row, rounded up
+    so that rows * cols is at least len(s); the remainder is filled with
+    spaces. The header lines are "P5", the column count, the row count and
+    the maximum grey value 255.
+    """
+    size = len(s)
+    rows = int(size / 4096) + 1
+    cols = int(size / rows) + 1
+
+    for field in ("P5", cols, rows, 255):
+        print(field)
+    print(s, end='')
+    print(" " * (rows * cols - size))
+
+
+# Script entry point: build the table from the PSL file named by the first
+# command line argument and write it to stdout as a PGM image.
+table=dict()
+walk(build_tree_from_psl(sys.argv[1]), table)
+# Spaces are removed from the JSON text before it is dumped.
+pgmdump(json.dumps(table).replace(' ',''))
+
+# The self-test below only runs when the second argument is "test".
+if len(sys.argv) < 3 or sys.argv[2] != "test":
+    sys.exit(0)
+
+def lookup(url, d):
+    """Return the registrable domain of ``url`` according to table ``d``.
+
+    url: dot-separated host name, e.g. "shop.amazon.co.uk".
+    d:   nested lookup table keyed by labels starting at the TLD; 0 marks
+         the end of a suffix and a "*" key matches any single label.
+
+    The result is the matched suffix plus one more label, e.g.
+    "amazon.co.uk". A host that is itself a suffix is returned unchanged.
+    """
+    urlparts = url.split('.')[::-1]
+
+    # Use the table that was passed in rather than the module-level one.
+    lut = d
+    res = []
+
+    for depth, part in enumerate(urlparts):
+        res.append(part)
+        # Past the end of a suffix: this label is the registrable one.
+        if depth > 0 and not lut:
+            break
+        if part in lut:
+            lut = lut[part]
+        elif '*' in lut or depth == 0:
+            # Wildcard rule, or a TLD without any table entry: the Public
+            # Suffix List treats an unlisted TLD as if the rule "*" applied.
+            lut = 0
+        else:
+            break
+
+    return ".".join(res[::-1])
+
+# Self-test: print the lookup result for a set of representative host names.
+for test in [
+        'example.com',
+        'amazon.com',
+        'show.amazon.com',
+        'amazon.co.uk',
+        'shop.amazon.co.uk',
+        'tyridal.no',
+        'digi.gitapp.si',
+        'www.tyridal.no',
+        'torbjorn.tyridal.no',
+        'wilson.no.eu.org',
+        'xxx.wilson.no.eu.org',
+        'weare.org.om',
+        'rave.weare.org.om',
+        'rave.blogspot.co.nz',
+        'rave.blogspot.com',
+        'xx.rave.blogspot.co.nz',
+        'xx.rave.blogspot.com',
+        'blogspot.com',
+        ]:
+    print(test, "->", lookup(test, table))
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,3 +11,4 @@ scrypt-asm.js @@
     *.msi
     *.exe
     /libscrypt/crypto_scrypt-nosse-nommap.c
+    publicsuffixlist/public_suffix_list.dat