-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathparse_agcom
executable file
·141 lines (110 loc) · 3.39 KB
/
parse_agcom
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/perl
# vim: shiftwidth=4 tabstop=4
#
# Copyright 2014 by Seeweb s.r.l.
# Written by Marco d'Itri <[email protected]>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
use warnings;
use strict;
use autodie;
use Regexp::Common 2013031201 qw(net URI);
use List::MoreUtils qw(uniq);
use File::Slurp;
# Set this to true if your domain censorship method automatically censors
# all the subdomains of a censored domain.
my $Remove_Subdomains = 1;
##############################################################################
die "Usage: $0 TXT_INPUT DOMAINS_OUTPUT IPS_OUTPUT\n"
if @ARGV != 3;
my $lists = parse_agcom_file($ARGV[0]);
die "Censoring URLs is not supported!\n"
if $lists->{urls} and @{ $lists->{urls} };
die "No domains found!\n"
if not $lists->{domains} or not @{ $lists->{domains} };
remove_subdomains($lists) if $Remove_Subdomains;
write_file($ARGV[1], { atomic => 1 },
map { "$_\n" } sort @{ $lists->{domains} });
write_file($ARGV[2], { atomic => 1 },
map { "$_\n" } sort @{ $lists->{ips} });
exit 0;
##############################################################################
sub parse_agcom_file {
my ($file) = @_;
my ($number, $date_y, $date_m, $date_d);
open(my $fh, '<', $file);
my (@ips, @domains, @urls);
my $state = 0;
while (<$fh>) {
s/^\s+//;
# these two regexps cannot be joined because the last line may lack
# the end of line character(s)
s/[\r\n]+$//;
s/\s*$//;
# skip empty lines
next if /^$/;
# look for a line containing an incremental number and a date
if ($state == 0) {
($number, $date_y, $date_m, $date_d) = /^
\s*
(\d+) # incremental number
[\s_]+
(20[12]\d) # year
[\.:-]+
([012]\d|3[01]) # day
[\.:-]+
(0\d|1[012]) # month
\s*
$/x;
# if found, then start looking for the list entries
$state = 1 if $number and $date_y and $date_m and $date_d;
next;
}
# normalize some domains which are written as URLs
s#^http://($RE{net}{IPv4}|$RE{net}{domain}{-nospace}{-rfc1101})/?$#$1#;
if (/^$RE{net}{IPv4}$/) {
push(@ips, $_);
next;
}
if (/^$RE{net}{domain}{-nospace}{-rfc1101}$/) {
push(@domains, lc $_);
next;
}
if (/^$RE{URI}{HTTP}{-scheme => 'https?'}$/) {
push(@urls, $_);
next;
}
warn "Invalid entry in $file: '$_'\n";
}
close $fh;
# die "$file has no header!\n"
# if not ($number and $date_y and $date_m and $date_d);
die "$file contains no data!\n" if not (@domains or @ips or @urls);
return {
number => $number + 0,
date => "$date_y-$date_m-$date_d",
ips => [ uniq @ips ],
domains => [ uniq @domains ],
urls => [ uniq @urls ],
};
}
##############################################################################
sub remove_subdomains {
my ($l) = @_;
return if not $l or not $l->{domains};
my $list = $l->{domains};
my %domains = map { $_ => undef } @$list;
for (my $i = 0; $i < @$list; $i++) {
my $domain = $list->[$i];
# remove each leading component of the domain and check if the new
# resulting domain is in our list
while ($domain =~ s/^.+?\.(.+\..+)$/$1/) {
delete $list->[$i] if exists $domains{$domain};
}
}
# remove the deleted domains from the list
$l->{domains} = [ grep { defined $_ } @{ $l->{domains} } ];
}