This repository has been archived by the owner on Jul 8, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
run.sh
executable file
·90 lines (73 loc) · 2.03 KB
/
run.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/bin/bash
# Nightly URL pipeline: fetch Alexa/DMOZ source lists, scan hosts, join the
# results, upload them to Google Storage, and start the Dataflow job.
#
# Usage: run.sh [TIMESTAMP]
#   TIMESTAMP defaults to today's date in YYYYMMDD form.
if [ -n "${1:-}" ]; then
  TIMESTAMP=$1
else
  TIMESTAMP=$(date "+%Y%m%d")
fi
echo "Starting processing for $TIMESTAMP"
# Remember where the script was launched from; the hosts binary and
# process.rb are expected to live here.
BASE=$(pwd)
DATA="$HOME/archive/urls/$TIMESTAMP"
mkdir -p "$DATA"
cd "$DATA" || { echo "Cannot cd to $DATA, exiting." >&2; exit 1; }
# Download the Alexa Top 1M list unless a previous run already fetched it.
echo "Fetching Alexa Top 1M archive..."
if [ ! -f "top-1m.csv.zip" ]; then
  # -N: only re-download if remote copy is newer; -nv: terse logging.
  if ! wget -nv -N "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"; then
    # NB: checking via "if ! wget" (rather than echo-then-bare-exit) so the
    # script actually exits non-zero — a bare `exit` after echo returns 0.
    echo "Alexa fetch failed, exiting." >&2
    exit 1
  fi
else
  echo "Alexa data already downloaded, skipping."
fi
# Download the DMOZ RDF dump unless a previous run already fetched it.
if [ ! -f "content.rdf.u8.gz" ]; then
  echo "Fetching DMOZ open directory RDF dump..."
  if ! wget -nv -N "http://rdf.dmoz.org/rdf/content.rdf.u8.gz"; then
    # Exit with a real failure status (bare `exit` after echo would exit 0).
    echo "DMOZ fetch failed, exiting." >&2
    exit 1
  fi
else
  echo "DMOZ data already downloaded, skipping."
fi
# Extract the host column from the Alexa CSV and split it into 100k-line
# chunks (tmp/split-aa, split-ab, ...) for the scanner to consume.
echo "Cleaning and splitting host list."
mkdir -p tmp && cd tmp || { echo "Cannot create/enter tmp dir, exiting." >&2; exit 1; }
zcat ../top-1m.csv.zip | cut -d, -f2 | split --lines=100000 - split-
cd "$BASE" || { echo "Cannot cd back to $BASE, exiting." >&2; exit 1; }
export GOPATH=~/hosts
# The scanner opens one socket per in-flight host; raise the fd limit.
ulimit -n 50000
# Build the ./hosts scanner binary; don't continue with a stale/missing one.
if ! go build; then
  echo "go build failed, exiting." >&2
  exit 1
fi
# Scan every split chunk with the ./hosts binary, appending JSON results to
# $DATA/hosts.json, then compress. Skipped if a compressed result exists.
if [ ! -f "$DATA/hosts.json.gz" ]; then
  # Remove old scan results, if present (-f: no error when absent).
  rm -f "$DATA/hosts.json"
  for f in "$DATA"/tmp/split-*
  do
    # Guard against the glob matching nothing (literal pattern left behind).
    [ -e "$f" ] || continue
    echo "Processing $f host file..."
    # Append (2>>) so logs from earlier chunks are not overwritten.
    if ! ./hosts -workers=500 -output="$DATA/hosts.json" < "$f" 2>> /var/log/HA-host-crawl.log; then
      echo "Host scanner failed, exiting." >&2
      exit 1
    fi
  done
  echo "Compressing host scan results..."
  pigz "$DATA/hosts.json"
else
  echo "Host scanner finished, skipping."
fi
# Join the Alexa, DMOZ, and host-scan datasets into one JSON file.
# The "done" marker is written into $DATA after the sync step; the cwd here
# is $BASE, so the original relative check could never match — test the
# marker at its real location.
if [ ! -f "$DATA/done" ]; then
  echo "Starting data join process..."
  if ! ruby process.rb -a "$DATA/top-1m.csv.zip" -d "$DATA/content.rdf.u8.gz" -s "$DATA/hosts.json.gz" > "$DATA/joined.json" 2> /var/log/HA-host-join.log; then
    echo "Data join failed, exiting." >&2
    exit 1
  fi
else
  echo "Data join finished, skipping."
fi
cd "$DATA" || { echo "Cannot cd to $DATA, exiting." >&2; exit 1; }
echo "Syncing data to Google Storage..."
# Drop a completion marker so a re-run can skip finished stages; it is
# uploaded along with the data below.
echo "" > "done"
# -n: no-clobber, skip objects that already exist in the bucket.
gsutil cp -n * "gs://httparchive/urls/${TIMESTAMP}/"
echo "Kicking off Dataflow pipeline..."
cd "$BASE" || { echo "Cannot cd back to $BASE, exiting." >&2; exit 1; }
python dataflow.py --input "gs://httparchive/urls/$TIMESTAMP/joined.json" --output "httparchive:urls.$TIMESTAMP"
echo "Done."