From 7fd6d6b4df47c6b2fd79093974e81b03859dcf4e Mon Sep 17 00:00:00 2001
From: ReversingLabs <55623149+rl-devops@users.noreply.github.com>
Date: Fri, 27 Jan 2023 12:24:40 +0100
Subject: [PATCH] add a simple pattern analyzer for future use

---
 DONE                 |  2 ++
 analize_patterns.py  | 58 +++++++++++++++++++++++++++++++++++++++++++
 compare_known_tld.py | 59 ++++++++++++++++++++++----------------------
 3 files changed, 90 insertions(+), 29 deletions(-)
 create mode 100755 analize_patterns.py

diff --git a/DONE b/DONE
index 18ffddc..cee52e6 100644
--- a/DONE
+++ b/DONE
@@ -43,3 +43,5 @@ DONE
   -  convert the list of tld to Dict
   -  allow override or change and adding new domains without needing a new version directly
   -  tested with existing testdomains, all reponses will now respond with the true tld not the one with a underscore
+
+  - add simple autodetect based on tld from IANA, try to use the .com patterns to see if we get something useful
diff --git a/analize_patterns.py b/analize_patterns.py
new file mode 100755
index 0000000..111d686
--- /dev/null
+++ b/analize_patterns.py
@@ -0,0 +1,58 @@
+#! /usr/bin/env python3
+
+import sys
+import re
+from typing import (
+    # Optional,
+    # List,
+    Dict,
+)
+
+# most likely we can now introduce trailing whitespace trim on all lines from whois,
+# and simplify the trailing whitespace rules,
+# as \r is already gone now and that was the most distinct line ending.
+# occasionally we need to detect \n\s+ for groups that belong together,
+# mostly with indented blocks of nameservers
+
+# import whois
+from whois.tld_regexpr import ZZ
+
+
+def buildRegCollection(zz: Dict):
+    regCollection = {}  # key -> {pattern: compiled regex, or None when the pattern is not a str}
+    # Collect every distinct pattern found in zz, grouped by the key it is stored under.
+    for name in zz:
+        # z is the entry stored under this name; its keys are walked below
+        z = zz[name]
+        for key in z:
+            if key is None:
+                continue
+
+            if key.startswith("_"):  # underscore-prefixed keys are skipped (presumably settings such as "_server", not patterns)
+                continue
+
+            if key in ["extend"]:  # "extend" is skipped as well (presumably it names another entry rather than a pattern)
+                continue
+
+            if key not in regCollection:  # the bucket is created before the None check, so a key may end up with no patterns
+                regCollection[key] = {}
+
+            reg = z[key]
+            if reg is None:
+                continue
+
+            regCollection[key][reg] = None  # reg is used as a dict key, so identical patterns are stored once
+            if isinstance(reg, str):  # only str patterns are compiled; other values keep None
+                regCollection[key][reg] = re.compile(reg, flags=re.IGNORECASE)
+
+    return regCollection
+
+
+if __name__ == "__main__":
+    regCollection = buildRegCollection(ZZ)  # ZZ is imported from whois.tld_regexpr
+
+    for name in sorted(regCollection.keys()):
+        print(f"## {name}", file=sys.stderr)  # group header goes to stderr, so stdout carries only the pattern lines
+        for key in sorted(regCollection[name].keys()):  # here key is the pattern itself (the inner dict is keyed by pattern)
+            if key:  # falsy (e.g. empty) patterns are not printed
+                print(f"{name}: {key}")
diff --git a/compare_known_tld.py b/compare_known_tld.py
index ca6a88c..e867bd4 100755
--- a/compare_known_tld.py
+++ b/compare_known_tld.py
@@ -67,34 +67,35 @@
 found = {}
 for tld in dataList2:
     data, status = i.getInfoOnOneTld(tld)
-    # print(status, data)
-
-    if data and "whois" in data and data["whois"] and data["whois"] != "NULL":
-        wh = data["whois"]
-        if wh.endswith(f".{tld}"):
-            dd = wh.split(".")[-2:]
-        else:
-            dd = ["meta", tld]
-
-        zz = _do_whois_query(
-            dd,
-            ignore_returncode=False,
-            server=wh,
-        )
-
-        pp = {"_server": wh, "extend": "com"}
-        aDictToTestOverride = {tld: pp}
-
-        whois.mergeExternalDictWithRegex(aDictToTestOverride)
-        try:
-            d = whois.query(".".join(dd))
-            if d:
-                print(d.__dict__)
-                if len(d.name_servers) > 0:
-                    found[tld] = pp
-                    print(f"## ZZ['{tld}'] = {found[tld]} # auto-detected via IANA tld")
-        except Exception as e:
-            print(e)
 
+    xtest = data and ("whois" in data) and (data["whois"]) and (data["whois"] != "NULL")
+    if not xtest:  # no usable "whois" entry (missing, empty or the string "NULL"): report and skip this tld
+        print(f"no whois info for tld: {tld} {data}")
+        continue
+
+    wh = data["whois"]
+    if wh.endswith(f".{tld}"):  # the whois server name ends in this tld: test with its last two labels
+        dd = wh.split(".")[-2:]
     else:
-        print(f"no whois info for tld: {tld}\n", data)
+        dd = ["meta", tld]  # otherwise test with the name "meta.<tld>"
+
+    print(f"try: {tld}")
+    zz = _do_whois_query(  # NOTE(review): zz is not used below and this call is outside the try - an exception here would end the loop; confirm intended
+        dd,
+        ignore_returncode=False,
+        server=wh,
+    )
+
+    pp = {"_server": wh, "extend": "com"}  # candidate entry: this server plus "extend": "com" (presumably reuses the .com patterns)
+    aDictToTestOverride = {tld: pp}
+
+    whois.mergeExternalDictWithRegex(aDictToTestOverride)
+    try:
+        d = whois.query(".".join(dd))
+        if d:
+            print(d.__dict__)
+            if len(d.name_servers) > 0:  # the candidate is kept only when the query yielded name servers
+                found[tld] = pp
+                print(f"## ZZ['{tld}'] = {found[tld]} # auto-detected via IANA tld")
+    except Exception as e:  # best effort: print the error and move on to the next tld
+        print(e)