parse_agcom

#!/usr/bin/perl
# vim: shiftwidth=4 tabstop=4
#
# Copyright 2014 by Seeweb s.r.l.
# Written by Marco d'Itri <md@Linux.IT>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

use warnings;
use strict;
use autodie;

use Regexp::Common 2013031201 qw(net URI);
use List::MoreUtils qw(uniq);
use File::Slurp;

# Set this to true if your domain censorship method automatically censors
# all the subdomains of a censored domain.
my $Remove_Subdomains = 1;

##############################################################################
die "Usage: $0 TXT_INPUT DOMAINS_OUTPUT IPS_OUTPUT\n"
	if @ARGV != 3;

my $lists = parse_agcom_file($ARGV[0]);

die "Censoring URLs is not supported!\n"
	if $lists->{urls} and @{ $lists->{urls} };

die "No domains found!\n"
	if not $lists->{domains} or not @{ $lists->{domains} };

remove_subdomains($lists) if $Remove_Subdomains;

write_file($ARGV[1], { atomic => 1 },
	map { "$_\n" } sort @{ $lists->{domains} });
write_file($ARGV[2], { atomic => 1 },
	map { "$_\n" } sort @{ $lists->{ips} });

exit 0;

##############################################################################
sub parse_agcom_file {
	my ($file) = @_;
	my ($number, $date_y, $date_m, $date_d);

	open(my $fh, '<', $file);

	my (@ips, @domains, @urls);
	my $state = 0;
	while (<$fh>) {
		s/^\s+//;
		# these two regexps cannot be joined because the last line may lack
		# the end of line character(s)
		s/[\r\n]+$//;
		s/\s*$//;

		# skip empty lines
		next if /^$/;

		# look for a line containing an incremental number and a date
		if ($state == 0) {
			($number, $date_y, $date_m, $date_d) = /^
				\s*
				(\d+)            # incremental number
				[\s_]+
				(20[12]\d)       # year
				[\.:-]+
				([012]\d|3[01])  # day
				[\.:-]+
				(0\d|1[012])     # month
				\s*
			$/x;

			# if found, then start looking for the list entries
			$state = 1 if $number and $date_y and $date_m and $date_d;
			next;
		}

		# normalize some domains which are written as URLs
		s#^http://($RE{net}{IPv4}|$RE{net}{domain}{-nospace}{-rfc1101})/?$#$1#;

		if (/^$RE{net}{IPv4}$/) {
			push(@ips, $_);
			next;
		}

		if (/^$RE{net}{domain}{-nospace}{-rfc1101}$/) {
			push(@domains, lc $_);
			next;
		}

		if (/^$RE{URI}{HTTP}{-scheme => 'https?'}$/) {
			push(@urls, $_);
			next;
		}

		warn "Invalid entry in $file: '$_'\n";
	}

	close $fh;

#	die "$file has no header!\n"
#		if not ($number and $date_y and $date_m and $date_d);

	die "$file contains no data!\n" if not (@domains or @ips or @urls);

	return {
		number	=> $number + 0,
		date	=> "$date_y-$date_m-$date_d",
		ips		=> [ uniq @ips ],
		domains	=> [ uniq @domains ],
		urls	=> [ uniq @urls ],
	};
}

##############################################################################
sub remove_subdomains {
	my ($l) = @_;
	return if not $l or not $l->{domains};

	my $list = $l->{domains};

	my %domains = map { $_ => undef } @$list;

	for (my $i = 0; $i < @$list; $i++) {
		my $domain = $list->[$i];
		# remove each leading component of the domain and check if the new
		# resulting domain is in our list
		while ($domain =~ s/^.+?\.(.+\..+)$/$1/) {
			delete $list->[$i] if exists $domains{$domain};
		}
	}

	# remove the deleted domains from the list
	$l->{domains} = [ grep { defined $_ } @{ $l->{domains} } ];
}