forked from petewarden/buzzprofilecrawl
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparallelcurl.php
executable file
·150 lines (118 loc) · 5.57 KB
/
parallelcurl.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
<?php
// This class is designed to make it easy to run multiple curl requests in parallel, rather than
// waiting for each one to finish before starting the next. Under the hood it uses curl_multi_exec
// but since I find that interface painfully confusing, I wanted one that corresponded to the tasks
// that I wanted to run.
//
// To use it, first create the ParallelCurl object:
//
// $parallelcurl = new ParallelCurl(10);
//
// The first argument to the constructor is the maximum number of outstanding fetches to allow
// before blocking to wait for one to finish. You can change this later using setMaxRequests()
// The second optional argument is an array of curl options in the format used by curl_setopt_array()
//
// Next, start a URL fetch:
//
// $parallelcurl->startRequest('http://example.com', 'on_request_done', array('something'));
//
// The first argument is the address that should be fetched
// The second is the callback function that will be run once the request is done
// The third is a 'cookie', that can contain arbitrary data to be passed to the callback
//
// This startRequest call will return immediately, as long as less than the maximum number of
// requests are outstanding. Once the request is done, the callback function will be called, eg:
//
// on_request_done($content, 'http://example.com', $ch, array('something'));
//
// The callback should take four arguments. The first is a string containing the content found at
// the URL. The second is the original URL requested, the third is the curl handle of the request that
// can be queried to get the results, and the fourth is the arbitrary 'cookie' value that you
// associated with this object. This cookie contains user-defined data.
//
// Since you may have requests outstanding at the end of your script, you *MUST* call
//
// $parallelcurl->finishAllRequests();
//
// before you exit. If you don't, the final requests may be left unprocessed!
//
// By Pete Warden <[email protected]>, freely reusable, see http://petewarden.typepad.com for more
class ParallelCurl {

    // Maximum number of requests allowed in flight before startRequest() blocks.
    public $max_requests;

    // Options applied to every new handle, in the format used by curl_setopt_array().
    public $options;

    // Bookkeeping for in-flight requests: integer handle key => array with the
    // 'url', 'callback' and 'user_data' entries supplied to startRequest().
    public $outstanding_requests;

    // The curl multi handle that drives all the transfers.
    public $multi_handle;

    // $in_max_requests: how many fetches may be outstanding before startRequest() blocks.
    // $in_options: optional curl options applied to every request.
    public function __construct($in_max_requests = 10, $in_options = array()) {
        $this->max_requests = $in_max_requests;
        $this->options = $in_options;

        $this->outstanding_requests = array();
        $this->multi_handle = curl_multi_init();
    }

    // Sets how many requests can be outstanding at once before we block and wait for one to
    // finish before starting the next one
    public function setMaxRequests($in_max_requests) {
        // Must write to the property; assigning a bare local variable has no effect.
        $this->max_requests = $in_max_requests;
    }

    // Sets the options to pass to curl, using the format of curl_setopt_array().
    // Only affects requests started after this call.
    public function setOptions($in_options) {
        $this->options = $in_options;
    }

    // Start a fetch from the $url address, calling the $callback function once it completes and
    // passing it the optional $user_data value. The callback should accept 4 arguments: the
    // fetched content, the url, the curl handle and the user data, eg
    // on_request_done($content, $url, $ch, $user_data);
    //
    // Blocks while max_requests (or more) fetches are already outstanding.
    // Throws RuntimeException if a curl handle cannot be created.
    public function startRequest($url, $callback, $user_data = array()) {
        $this->waitForOutstandingRequestsToDropBelow($this->max_requests);

        $ch = curl_init();
        if ($ch === false) {
            throw new RuntimeException("Error - couldn't create a curl handle for '$url'");
        }

        // Apply the shared options first so the URL and RETURNTRANSFER below always win.
        curl_setopt_array($ch, $this->options);
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);

        curl_multi_add_handle($this->multi_handle, $ch);

        $this->outstanding_requests[$this->handleKey($ch)] = array(
            'url' => $url,
            'callback' => $callback,
            'user_data' => $user_data,
        );

        // Gives curl a chance to actually kick off the transfer we just queued.
        $this->checkForCompletedRequests();
    }

    // You *MUST* call this function at the end of your script. It waits for any running requests
    // to complete, and calls their callback functions
    public function finishAllRequests() {
        $this->waitForOutstandingRequestsToDropBelow(1);
    }

    // Returns a value that is safe to use as an array key for a curl handle. Handles are
    // resources before PHP 8 and CurlHandle objects from PHP 8 on, and neither can be used
    // as an array key directly.
    private function handleKey($ch) {
        return is_object($ch) ? spl_object_id($ch) : (int) $ch;
    }

    // Drives the transfers and runs the callback of every request that has finished.
    // Never blocks.
    private function checkForCompletedRequests() {
        // Always let curl make progress first. Consulting curl_multi_select() before the first
        // curl_multi_exec() can report -1 forever on some libcurl builds, so nothing would start.
        do {
            $mrc = curl_multi_exec($this->multi_handle, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);

        // Now grab the information about the completed requests
        while ($info = curl_multi_info_read($this->multi_handle)) {
            $ch = $info['handle'];
            $key = $this->handleKey($ch);

            if (!isset($this->outstanding_requests[$key])) {
                throw new RuntimeException(
                    "Error - handle wasn't found in requests: '$key' in ".
                    print_r($this->outstanding_requests, true)
                );
            }

            $request = $this->outstanding_requests[$key];
            $content = curl_multi_getcontent($ch);

            // Retire the request before invoking the callback, so a callback that starts a new
            // request sees the freed slot instead of waiting on a request that is already done.
            unset($this->outstanding_requests[$key]);
            curl_multi_remove_handle($this->multi_handle, $ch);

            $callback = $request['callback'];
            $callback($content, $request['url'], $ch, $request['user_data']);
        }
    }

    // Blocks until there's less than the specified number of requests outstanding
    private function waitForOutstandingRequestsToDropBelow($max) {
        // A limit below 1 could never be satisfied and would spin forever.
        $max = max(1, (int) $max);

        while (count($this->outstanding_requests) >= $max) {
            $this->checkForCompletedRequests();
            if (count($this->outstanding_requests) < $max) {
                break;
            }

            // Wait for socket activity (up to a second) instead of sleeping unconditionally.
            // -1 means curl had nothing to wait on yet, so back off briefly to avoid spinning.
            if (curl_multi_select($this->multi_handle, 1.0) === -1) {
                usleep(10000);
            }
        }
    }
}
?>