-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreddit.ml
153 lines (144 loc) · 3.7 KB
/
reddit.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
(* Reddit *)
open Nethtml
open Scan
open Print
open Pffpsf
type entry =
{
mutable en_url : string;
mutable en_title : string;
mutable en_user : string;
mutable en_id : string option;
mutable en_comments : int option;
mutable en_score : int option
}
type details =
{
mutable de_up_votes : int;
mutable de_down_votes : int;
mutable de_comments : int;
}
let print_entry oc en =
fp oc "URL: %S\n" en.en_url;
fp oc "Title: %S\n" en.en_title;
fp oc "User: %S\n" en.en_user;
fp oc "Comments: %a\n" (print_option print_int) en.en_comments;
fp oc "ID: %a\n" (print_option print_string) en.en_id;
fp oc "Score: %a\n" (print_option print_int) en.en_score
let score_rex = Pcre.regexp "^([0-9]+) "
let id_from_comments_rex = Pcre.regexp "info/([^/]+)/comments"
let num_comments_rex = Pcre.regexp "\\b([0-9]+) *comment"
let user_rex = Pcre.regexp "^/user/"
let process_front doc =
let entries = ref [] in
on_matching
(has_class "entry")
(fun e ->
let en =
{
en_url = "";
en_user = "";
en_title = "";
en_id = None;
en_comments = None;
en_score = None
}
in
(*pf "Entry: %a\n" Www.dump_document e;*)
on_matching
(element "a" &&& has_class "title")
(fun e ->
with_attribute "href" (fun u -> en.en_url <- u) () e;
visit (on_data (fun u -> en.en_title <- u)) e;
)
e;
on_matching
(element "a" &&& has_class "bylink")
(fun e ->
with_attribute "href" (fun u ->
begin
try
en.en_id <- Some(Pcre.get_substring (Pcre.exec ~rex:id_from_comments_rex u) 1)
with
| Not_found -> ()
end) () e;
visit
(on_data
(fun u ->
try
en.en_comments <- Some(int_of_string (Pcre.get_substring (Pcre.exec ~rex:num_comments_rex u) 1))
with
| _ -> ()
)
)
e
)
e;
on_matching
(element "div" &&& has_class "little")
(on_matching
(element "a" &&&
(with_attribute "href"
(Pcre.pmatch ~rex:user_rex)
false))
(visit (on_data (fun u -> en.en_user <- u))))
e;
on_matching
(element "span" &&& has_class "inside")
(fun e ->
visit
(on_data
(fun u ->
try
en.en_score <- Some(int_of_string (Pcre.get_substring (Pcre.exec ~rex:score_rex u) 1))
with
| _ -> ())
)
e
)
e;
entries := en :: !entries
)
doc;
Array.of_list (List.rev !entries)
let process_details doc =
let de =
{
de_up_votes = 0;
de_down_votes = 0;
de_comments = 0;
}
in
let extractor name f =
(on_matching
(element "tr" &&& has (data_matches ((=) name)))
(on_matching
(element "td" &&& !!! (has_class "profline"))
(visit (on_data (fun u -> f (int_of_string u))))
)
)
in
on_matching
(has_class "details")
(fun e ->
extractor "up votes" (fun x -> de.de_up_votes <- x) e;
extractor "down votes" (fun x -> de.de_down_votes <- x) e;
)
doc;
on_matching
(has_class "entry")
(on_matching
(element "a" &&& has_class "bylink")
(visit
(on_data
(fun u ->
try
de.de_comments <- int_of_string (Pcre.get_substring (Pcre.exec ~rex:num_comments_rex u) 1)
with
| _ -> ()
)
)
)
)
doc;
de