forked from ElektraInitiative/libelektra
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlink-checker
executable file
·103 lines (90 loc) · 1.99 KB
/
link-checker
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/bin/sh
#
# @author Kurt Micheli <[email protected]>, Klemens Böswirth <[email protected]>
# @brief Checks whether http, https and ftp links are broken.
# @date 05.09.2016
# @tags validation
# The link file is a mandatory argument. A missing argument is a usage error,
# so the message goes to stderr and the exit status is non-zero to let callers
# (e.g. CI scripts) detect the misuse.
if [ -z "$1" ]; then
	echo "Usage: link-checker <linkfile>" >&2
	exit 1
fi
SCRIPT_DIR="$(dirname "$0")"

# Temporary file that receives one line per distinct link in the form
# "<link>|<file>[|<file>...]".
LINKFILE=$(mktemp) || exit 1
# Do not leave the temporary link list behind when the script exits.
trap 'rm -f "$LINKFILE"' EXIT

#######################################
# Converts "<file>|<link>" records into one record per distinct link.
# Arguments: $1 - path of the input file with "<file>|<link>" lines
# Outputs:   "<link>|<file>[|<file>...]" lines on stdout
# Returns:   1 if no input file was given, otherwise the pipeline status
#######################################
build_link_list() {
	# Some awk implementations skip an empty file operand and would then
	# read standard input instead, so reject it explicitly.
	if [ -z "$1" ]; then
		echo "build_link_list: missing link file argument" >&2
		return 1
	fi

	awk -F '|' '
	# Ignore links that contain version placeholder
	!/doc\/news\/_preparation_next_release\.md.*<<VERSION>>/ {
		# Drop one trailing slash from the link.
		sub("/$", "", $2)
		# Percent-encode the first "+" of the link.
		sub("\\+", "%2B", $2)
		print $2 "|" $1
	}' "$1" |
		# Byte-wise sort on the link only, so that equal links are always
		# adjacent regardless of the locale collation rules.
		LC_ALL=C sort -t '|' -k1,1 |
		awk -F '|' '
	# First record of a new link: flush the previous group and start a new one.
	key != $1 || NR == 1 {
		key = $1
		if (data) {
			print data
		}
		data = $0
		next
	}
	# Same link again: append only the "|<file>" part. The link is cut off
	# by length instead of by sub($1, ...), because a URL used as a regular
	# expression (think of "?", "(" or ")") does not match itself literally.
	{
		data = data substr($0, length($1) + 1)
	}
	END {
		# Do not emit an empty record when there was no input at all.
		if (NR > 0) {
			print data
		}
	}'
}

build_link_list "$1" > "$LINKFILE"
# Temporary copy of the whitelist without comment lines (starting with "#")
# and without blank lines, usable as a pattern file for "grep -f".
WHITELIST=$(mktemp)
# Remove the temporary files again when the script exits.
trap 'rm -f "$LINKFILE" "$WHITELIST"' EXIT
grep -Ev '^(#|[[:space:]]*$)' "$SCRIPT_DIR/../tests/linkchecker.whitelist" > "$WHITELIST"

# Number of checks started before the script waits for them to finish.
NUMTHREADS=10
# Passed to wget as --timeout.
TIMEOUT=20
# Passed to wget as --tries.
TRIES=5
#######################################
# Checks a single record of the link list.
# Globals:   WHITELIST, COUNTLINKS, NUMLINKS, TIMEOUT, TRIES (read)
# Arguments: $1 - record in the form "<link>|<file>[|<file>...]"
# Outputs:   stdout: note for whitelisted, malformed or plain http links
#            stderr: one "<file> <link>" line per file if the link is broken
#######################################
check() {
	# The link is the first http/https/ftp URL, up to the next "|".
	link=$(printf '%s\n' "$1" | grep -oE "(https|http|ftp):[^|]*")
	# Everything from the first "|" on, with the separators turned into blanks.
	files=$(printf '%s\n' "$1" | grep -oE "\|.*" | sed 's/|/ /g')

	if printf '%s\n' "$link" | grep -Eqf "$WHITELIST"; then
		echo "whitelisted: $link"
		printf "%i/%i\r" "$COUNTLINKS" "$NUMLINKS"
		return
	fi

	if [ -z "$link" ]; then
		echo "check the link format of $1"
		return
	fi

	# Decide by the scheme of the extracted link itself. Searching the whole
	# record for "http:" would also reject https links that merely embed an
	# http URL in their path or query.
	case "$link" in
	http:*)
		echo "link $1 is an http link but not whitelisted"
		return
		;;
	esac

	# First only probe the link; if that fails, retry with a real download
	# (presumably because some servers do not answer the probe properly)
	# before reporting the link as broken.
	if ! wget --spider --quiet --timeout="$TIMEOUT" --tries="$TRIES" "$link"; then
		if ! wget -O - --quiet --timeout="$TIMEOUT" --tries="$TRIES" "$link" > /dev/null; then
			# Word splitting is intended: $files is a blank-separated list.
			for file in $files; do
				echo >&2 "$file $link"
			done
		fi
	fi
}
# Run the checks in the background, in batches of NUMTHREADS jobs.
COUNTLINKS=0  # number of links started so far (progress display)
THREADCOUNT=0 # number of jobs in the current batch
NUMLINKS=$(wc -l < "$LINKFILE")
PIDS=""
while read -r line; do
	check "$line" &
	PIDS="$PIDS $!"
	THREADCOUNT=$((THREADCOUNT + 1))
	COUNTLINKS=$((COUNTLINKS + 1))
	printf "%i/%i\r" "$COUNTLINKS" "$NUMLINKS"
	if [ "$THREADCOUNT" -ge "$NUMTHREADS" ]; then
		# Batch is full: wait for all of its jobs before starting more.
		for pid in $PIDS; do
			wait "$pid"
		done
		PIDS=""
		# No job is running any more, so the next batch starts at zero;
		# otherwise every later batch would be one job short.
		THREADCOUNT=0
	fi
done < "$LINKFILE"

# Wait for the jobs of the last, possibly incomplete batch.
for pid in $PIDS; do
	wait "$pid"
done
# Terminate the "\r"-based progress line.
echo ""