Skip to content
This repository has been archived by the owner on Feb 3, 2024. It is now read-only.

Commit

Permalink
Merge pull request #258 from maarten-boot/development
Browse files Browse the repository at this point in the history
convert file with supported tld's to Dict
  • Loading branch information
DannyCork authored Jan 27, 2023
2 parents eceae65 + 7fd6d6b commit d42a7f5
Show file tree
Hide file tree
Showing 15 changed files with 3,793 additions and 918 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,4 @@ typescript
test.out
diff.out
tmp/
1
5 changes: 5 additions & 0 deletions DONE
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,8 @@ DONE

- add nic to default test group for makeTestdataAll.sh

- convert the list of tld to Dict
- allow override or change and adding new domains without needing a new version directly
- tested with existing testdomains, all responses will now respond with the true tld not the one with an underscore

- add simple autodetect based on tld from IANA, try to use the .com patterns to see if we get something useful
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,15 @@ Raise an issue https://github.com/DannyCork/python-whois/issues/new
2023-01-18: sorrowless
* add an opportunity to specify maximum cache age

2023-01-25: maarten_boot
* convert the tld file to a Dict, we now no longer need a mapper for python keywords or second level domains.
* utf8 level domains also need no mapper anymore and can be added as is with a translation to xn--<something>
* added xn-- tlds for all known utf-8 domains we currently have
* we can now add new domains on the fly or change them: whois.mergeExternalDictWithRegex(aDictToOverride) see example testExtend.py

2023-01-27: maarten_boot
* add autodetect via iana tld file (this has only tld's)

## Support
* Python 3.x is supported.
* Python 2.x IS NOT supported.
2 changes: 1 addition & 1 deletion TODO
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
TODO

# pt is difficult it often gives no data, it works in aws frankfurt through
# pt is difficult it often gives no data, it works in aws frankfurt though
ERROR: output; missing nameserver 'ns1.dnscpanel.com.' for tld: pt
ERROR: output; missing nameserver 'ns2.dnscpanel.com.' for tld: pt
58 changes: 58 additions & 0 deletions analize_patterns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#! /usr/bin/env python3

import sys
import re
from typing import (
# Optional,
# List,
Dict,
)

# most likely we can now introduce trailing whitespace trim on all lines from whois,
# and simplify trailing whitespace rules
# as \r is already gone now and that was the most distinct line ending
# occasionally we need to detect \n\s+ for groups that belong together
# mostly with indented blocks of nameservers

# import whois
from whois.tld_regexpr import ZZ


def buildRegCollection(zz: Dict):
    """Group every regex string found in *zz* by the field it belongs to.

    *zz* maps a tld name to a dict of ``field -> regex string`` entries.
    The result maps each field name to a dict whose keys are the distinct
    values seen for that field and whose values are the compiled,
    case-insensitive patterns (``None`` for a value that is not a string).

    Entries are ignored when the field name is ``None``, starts with ``_``
    or is ``"extend"``.  A field whose value is ``None`` still gets an
    (initially empty) dict in the result.
    """
    ignoredFields = ["extend"]
    collected = {}

    for tldRules in zz.values():
        for field, pattern in tldRules.items():
            if field is None or field.startswith("_") or field in ignoredFields:
                continue

            # the per-field dict is created even when this tld has no regex for it
            perField = collected.setdefault(field, {})
            if pattern is None:
                continue

            if isinstance(pattern, str):
                perField[pattern] = re.compile(pattern, flags=re.IGNORECASE)
            else:
                perField[pattern] = None

    return collected


if __name__ == "__main__":
    regCollection = buildRegCollection(ZZ)

    # the "## <field>" headers go to stderr, so stdout carries only
    # "<field>: <regex>" lines, sorted by field and then by regex
    for field in sorted(regCollection):
        print(f"## {field}", file=sys.stderr)
        for pattern in filter(None, sorted(regCollection[field])):
            print(f"{field}: {pattern}")
101 changes: 101 additions & 0 deletions compare_known_tld.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#! /usr/bin/env python3

# clone https://github.com/jophy/iana_tld_list in ./tmp

import urllib.request

from tmp.iana_tld_list.iana import IANA

import whois
from whois._1_query import _do_whois_query

# allow verbose messages during testing (all on stderr)
verbose = False

# by default the all-tld file will be refreshed every 24 hours,
# but you can also force a new download at any time
forceDownloadTld = False

# do you want to overwrite the results file ?
overwrite = True

# do you want interactive questions if files will be re-written?
interactive = False

# if autoProcessAll is True: all tld's will be processed (initial run > 20 minutes)
autoProcessAll = False

# NOTE(review): not referenced in the visible part of this script -- confirm it is still needed
with_test_original = True

# directory handed to IANA() below; presumably where it keeps its downloaded data -- confirm
dirName = "/tmp/iana_data"

# helper imported from the iana_tld_list checkout in ./tmp, configured with the
# switches above; the script later calls i.getInfoOnOneTld(tld) on it
i = IANA(
    dirName=dirName,
    verbose=verbose,
    overwrite=overwrite,
    interactive=interactive,
    autoProcessAll=autoProcessAll,
    forceDownloadTld=forceDownloadTld,
)

# get the tld's and second level domains python whois already knows
known = sorted(whois.validTlds())

# get iana data; the context manager closes the HTTP response once it has been read
URL = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
with urllib.request.urlopen(URL) as response:
    data = response.read().decode("utf-8").lower()

# the IANA file starts with a "# Version ..." comment line: that is not a tld,
# so drop comment lines (and blank ones) before building the sorted list of names
dataList = sorted(
    line.strip()
    for line in data.splitlines()
    if line.strip() and not line.lstrip().startswith("#")
)

# report every single-label name python whois knows that IANA does not list
# (names containing a "." are second level domains and can never be in the IANA list)
ianaNames = set(dataList)
for name in known:
    if name not in ianaNames and "." not in name:
        print(f"{name} tld name from python_whois is not known in IANA list")

# keep only the IANA names python whois has no rules for yet, in dataList order
knownNames = set(known)
dataList2 = [name for name in dataList if name not in knownNames]

# Try to auto detect new domains via IANA and some known common regex lists like .com
#
# For every tld that IANA lists but python whois has no rules for:
#   1. ask the IANA helper for the whois server of that tld,
#   2. pick a domain name to test with,
#   3. register a temporary rule set {"_server": <server>, "extend": "com"},
#   4. query the test domain and keep the tld when name servers were parsed.
found = {}
for tld in dataList2:
    # use a name of its own here instead of re-binding the module level `data`
    info, _status = i.getInfoOnOneTld(tld)

    hasWhois = info and ("whois" in info) and (info["whois"]) and (info["whois"] != "NULL")
    if not hasWhois:
        print(f"no whois info for tld: {tld} {info}")
        continue

    wh = info["whois"]
    if wh.endswith(f".{tld}"):
        # the whois server lives under the tld itself: test with its last two labels
        dd = wh.split(".")[-2:]
    else:
        dd = ["meta", tld]

    print(f"try: {tld}")
    pp = {"_server": wh, "extend": "com"}
    aDictToTestOverride = {tld: pp}

    # The raw probe and the rule merge are inside the try as well: a failure for
    # one tld is printed and the scan moves on to the next tld instead of
    # terminating the whole run.
    try:
        # direct query against the server; only its success matters, the output is not used
        _do_whois_query(
            dd,
            ignore_returncode=False,
            server=wh,
        )

        whois.mergeExternalDictWithRegex(aDictToTestOverride)

        d = whois.query(".".join(dd))
        if d:
            print(d.__dict__)
            # parsed name servers are taken as the sign that the .com patterns work here
            if len(d.name_servers) > 0:
                found[tld] = pp
                print(f"## ZZ['{tld}'] = {found[tld]} # auto-detected via IANA tld")
    except Exception as e:
        print(e)
14 changes: 14 additions & 0 deletions convert_to_dict.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#! /usr/bin/env bash
#
# Rewrite the "name = {...}" tld assignments of the source file into
# ZZ["name"] = {...} dictionary entries. The result is written to the
# target file and echoed on stdout.

SOURCE_FILE="whois/tld_regexpr.py"
TARGET_FILE="whois/tld_regexpr2.py"

# perl -p applies the substitutions to every input line and prints the line
perl -p -e '
    # translate all tld to DICT and substitute for the real tld in case of _
    s/^([a-z]+)_([a-z]+)\s+=/ZZ["$1.$2"] =/;
    s/^([a-z]+)\s+=/ZZ["$1"] =/;
    # if we refer to a tld also change _ to .
    s/"extend":\s+"(\w+)_(\w+)"/"extend": "$1.$2"/;
' "$SOURCE_FILE" |
    tee "$TARGET_FILE"
120 changes: 120 additions & 0 deletions testExtend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#!/usr/bin/python3
import whois

# NOTE(review): not referenced anywhere in the visible part of this script -- confirm it is still needed
Verbose = True

"""
initial testing had errors for these
we DONT have xn--3ds443g 在线 (online)
we DONT have xn--45q11c 八卦 (gossip)
we DONT have xn--czru2d 商城 (mall)
we DONT have xn--fiq228c5hs 中文网 (website)
we DONT have xn--hxt814e 网店 (webshop)
"""


def t1(domain: str, text: str):
    """Query *domain* and print what came back, labelled with *text*.

    Prints "<text>: <domain>" first.  A truthy query result is printed as
    its attribute dict, a falsy one is printed as is.  Any exception is
    caught and printed together with the domain, so one failing lookup
    never stops the script.
    """
    print(f"{text}: {domain}")
    try:
        result = whois.query(domain)
        print(result.__dict__ if result else result)
    except Exception as err:
        print(domain, err)


def xMain():
    """Show the effect of whois.mergeExternalDictWithRegex on two domains.

    Every test domain is queried once with the rules that ship with the
    package ("BEFORE"), then the override dict is merged in and every
    domain is queried again ("AFTER").
    """
    testDomains = ("google.si", "google.mk")

    overrides = {
        # replaces the rules of a tld that already exists
        "si": {
            "domain_name": r"domain:\s+(.+)",
            "status": r"status:\s+(.+)",
            "registrar": r"registrar:\s+(.+)",
            "name_servers": r"nameserver:\s*(.+)",
            "creation_date": r"created:\s+(.+)",
            "expiration_date": None,
            "updated_date": None,
            "registrant_country": None,
        },
        # was meant to define a tld that did not exist yet; mk is supported
        # by now, so this part of the test no longer proves anything
        "mk": {
            "extend": "com",
            "domain_name": r"domain:\s+(.+)",
            "status": r"status:\s+(.+)",
            "registrar": r"registrar:\s+(.+)",
            "name_servers": r"nserver:\s*(.+)",
            "creation_date": r"registered:\s+(.+)",
            "expiration_date": r"expire:\s+(.+)",
            "updated_date": r"changed:\s+(.+)",
            "registrant_country": None,
            "registrant": r"registrant:\s+(.+)",
        },
    }

    def queryAll(label):
        # one labelled pass over all test domains
        for name in testDomains:
            t1(name, label)

    queryAll("BEFORE")
    whois.mergeExternalDictWithRegex(overrides)
    queryAll("AFTER")


# Run the before/after comparison right away; there is no __main__ guard,
# so importing this module runs it as well.
xMain()

"""
% Domain Information over Whois protocol
%
% Whoisd Server Version: 3.9.0
% Timestamp: Fri Nov 25 16:49:33 2022
domain: google.mk
registrant: UNET-R11
admin-c: UNET-C12
nsset: UNET-NS191
registrar: UNET-REG
registered: 13.05.2008 14:00:00
changed: 17.04.2014 12:50:32
expire: 13.05.2023
contact: UNET-R11
org: Google LLC
name: Google LLC
address: Amphiteatre Parkway 1600
address: Mountain View
address: 94043
address: US
phone: +1.6502530000
fax-no: +1.6502530000
e-mail: [email protected]
registrar: UNET-REG
created: 25.03.2014 11:48:02
changed: 29.09.2021 16:26:23
contact: UNET-C12
name: Mark Monitor Inc.
address: 3540 East Longwing Lane Suite 300
address: Meridian
address: 83646
address: US
phone: +1.2083895740
e-mail: [email protected]
registrar: UNET-REG
created: 25.03.2014 11:48:00
changed: 19.11.2019 16:47:01
nsset: UNET-NS191
nserver: ns2.google.com
nserver: ns1.google.com
tech-c: UNET-C12
registrar: UNET-REG
created: 17.04.2014 12:50:22
changed: 17.04.2014 21:02:14
"""
38 changes: 0 additions & 38 deletions testdata/meta.rk/input

This file was deleted.

6 changes: 0 additions & 6 deletions testdata/meta.rk/output

This file was deleted.

Loading

0 comments on commit d42a7f5

Please sign in to comment.