caterpillar.php

<?php
// get current directory
if (!defined('CATERPILLAR_DIR')) define('CATERPILLAR_DIR', dirname(__FILE__));
// load the RollingCurl Request class
require(CATERPILLAR_DIR . '/inc/RollingCurl_Request.php');
// load the RollingCurl library
require(CATERPILLAR_DIR . '/inc/RollingCurl.php');
// set the memory limit
ini_set('memory_limit', '64M');

/**
 * The caterpillar class is intended to be used for
 * educational purposes only.
 *
 * @author      Corey Ballou
 * @copyright   Copyright (c) 2009, Corey Ballou
 * @license     http://www.apache.org/licenses/LICENSE-2.0
 * @link        http://www.jqueryin.com/projects/caterpillar
 * @version     1.0
 */
class Caterpillar extends RollingCurl {

	protected $startUrl;        // the starting URL of the crawler
	protected $domain;          // the domain path
	protected $ch;              // stores the curl handler
	protected $db;              // stores the database connection
	protected $maxDepth = 3;    // the maximum number of trailing forward slashes
	protected $startTime;       // the timestamp that caterpillar started

	protected $tempTable;			// the name of the temporary table
	protected $page_cache = array();	// the page cache for batching counts
	protected $pagesRequested = array();	// handles checking for first page requests

    /**
     *  Default constructor.
     *
     *  @access public
     *  @param  string  $startUrl   The starting url for crawling
     *  @param  string  $dbUser     The database username
     *  @param  string  $dbPass     The database password
     *  @param  string  $dbName     The database name
     *  @param  string  $dbHost     The database host
     *  @return void
     */
    public function __construct($startUrl, $dbUser, $dbPass, $dbName, $dbHost)
    {
	// register the Rolling Curl callback
	parent::__construct('parseHtml');

        // validate the url is valid HTTP or HTTPS
        if (strpos($startUrl, 'http://') === false &&
            strpos($startUrl, 'https://') === false) {
            throw new Exception('Your starting caterpillar URL must begin with http or https.');
        }

	// store the base url
	$this->startUrl = rtrim($startUrl, '/') . '/';

	// store the start time
	$this->startTime = date('Y-m-d H:i:s');

        // store the base domain
        $info = parse_url($startUrl);
        $this->domain = $info['scheme'] . '://' . $info['host'];

	// open mysql connection
	$this->db = mysql_connect($dbHost, $dbUser, $dbPass);
        if ($this->db === false) {
            throw new Exception('The caterpillar database connection could not be established');
        }

	// use user specified database
	if (!mysql_select_db($dbName)) {
            throw new Exception('An error occurred attempting to connect to the database ' . $dbName);
        }

    }

	/**
	 * Begins the crawling process.  Initially starts
	 * by removing all old crawled entries from the database.
	 *
	 * @access 	public
	 * @return	void
	 */
	public function crawl()
	{
        	// truncate any non-existant pages
		$this->resetIndex();

		// create a temporary table for incrementing counts
		$this->createTempTable();

		// add initial URL to crawl list
		$this->addRequest($this->startUrl);

		// begin crawling the url
		$this->crawlUrls();

		// update url counts and remove the temp table
		$this->finalizeUrlCounts();
	}

	/**
	 * Function to check whether a link already exists in the database or not.
	 * If the link exists the count is updated.
	 *
	 * @access	public
	 * @param	string	$link
	 * @param	string	$html
	 * @param	string	$filesize
	 * @return	bool
	 */
	public function checkUrlExists($link, $html, $filesize)
	{
		$sql = sprintf('SELECT SQL_NO_CACHE id, filesize, last_tested FROM crawl_index WHERE link = "%s"', mysql_real_escape_string($link));
		$res = mysql_query($sql, $this->db);
		if (mysql_num_rows($res) > 0) {
			$res = mysql_fetch_assoc($res);
			if ($res['last_tested'] == $this->startTime) {
				$this->updateInboundCount($res);
			} else {
				$this->updateInboundCount($res, $html, $filesize);
			}

			return true;
		}

		return false;
	}

	/**
	 * Function to update the inbound link count of a URL.
	 *
	 * @access	protected
	 * @param	mixed		$page
	 * @param	string		$html
	 * @param	mixed		$filesize
	 */
	protected function updateInboundCount($page, $html = '', $filesize = null)
	{
		// no filesize to check because its already been updated
		if ($filesize === null) {
			if (count($this->page_cache) < 1000) {
				$this->page_cache[] = (is_array($page)) ? $page['id'] : $page;
				return true;
			} else {
				// bulk insert rows
				$sql = sprintf('INSERT INTO %s (page_id) VALUES (%s)', $this->tempTable, implode('),(', $this->page_cache));
				// reset cache to empty state
				$this->page_cache = array();
			}
		}

		// filesize hasnt changed
		else if ($page['filesize'] == $filesize) {
			$sql = sprintf('UPDATE crawl_index SET
							`count` = `count` + 1,
							last_tested = "%s"
							WHERE id = %d',
							mysql_real_escape_string($this->startTime),
							$page['id']);
		}

		// filesize has changed since last check
		else {
			$sql = sprintf('UPDATE crawl_index SET
							`count` = `count` + 1,
							filesize = %1$d,
							contenthash = "%2$s",
							last_update = "%3$s",
							last_tested = "%3$s"
							WHERE id = %4$d',
							$filesize,
							mysql_real_escape_string($this->_crc32($html)),
							mysql_real_escape_string($this->startTime),
							$page['id']);
		}

		return mysql_query($sql);
	}

	/**
	 * Checks for a link's last_tested time to see if it has already
	 * been crawled during this session.
	 *
	 * @access	protected
	 * @param	string		$link
	 */
	protected function checkIfFirstRequest($link)
	{
		// simple cache to avoid db
		if (isset($this->pagesRequested[$link])) {
			return $this->pagesRequested[$link];
		}

		// cache miss, need to check
		$sql = sprintf('SELECT SQL_NO_CACHE id, last_tested FROM crawl_index WHERE link = "%s"', mysql_real_escape_string($link));
		$res = mysql_query($sql, $this->db);
		if (mysql_num_rows($res) > 0) {
			$res = mysql_fetch_assoc($res);
			$this->pagesRequested[$link] = $res['id'];
			if ($res['last_tested'] != $this->startTime) {
				return true;
			} else {
				return $res['id'];
			}
		}

		return true;
	}

	/**
	 * Function to add a given URL to the crawled index table.
	 *
	 * @access 	public
	 * @param	string	$url
	 * @param	string	$html
	 * @param	string	$filesize
	 * @return	void
	 */
	public function addUrlToIndex($url, $html, $filesize)
	{
		// the link doesnt exist in the db, insert now
		$sql = sprintf('INSERT INTO crawl_index SET
				link = "%1$s",
				`count` = 1,
				contenthash = "%2$s",
				filesize = %3$d,
				last_update = "%4$s",
				last_tested = "%4$s"',
				mysql_real_escape_string($url),
				mysql_real_escape_string($this->_crc32($html)),
				$filesize,
				mysql_real_escape_string($this->startTime));

		mysql_query($sql, $this->db);
		return mysql_insert_id();
	}

	/**
	 * Callback function from Rolling Curl which parses the recently
	 * scraped page for more URLs.
	 *
	 * @access	protected
	 * @param	string	$url		The url requested
	 * @param	string	$html		The body content
	 * @param	int		$http_code	The returned HTTP code
	 * @param	string	$title		The document title
	 * @param	int		$filesize	The size of the downloaded file in bytes
	 */
	protected function parseHtml($url, $html, $http_code, $title, $filesize)
	{
		if ($http_code >= 200 && $http_code < 400 && !empty($html)) {

			// add URL to index (or update count)
			if (!$this->checkUrlExists($url, $html, $filesize)) {
				$this->addUrlToIndex($url, $html, $filesize);
			}

			// find all urls on the page
			$urlMatches = array();
			if (preg_match_all('/href="([^#"]*)"/i', $html, $urlMatches, PREG_PATTERN_ORDER)) {

				// garbage collect
				unset($html, $urlMatches[0]);

				// iterate over each link found on the page
				foreach ($urlMatches[1] as $k => $link) {

					$link = trim($link);
					if (strlen($link) === 0) $link = '/';

					// don't allow more than maxDepth forward slashes in the URL
					if ($this->maxDepth > 0
                        && strpos($link, 'http') === false
                        && substr_count($link, '/') > $this->maxDepth) {
						continue;
					}

					// check for a relative path starting with a forward slash
					if (strpos($link, 'http') === false && strpos($link, '/') === 0) {
						// update the link with the full domain path
                        $link = $this->domain . $link;
					}
                    // check for a same directory reference
                    else if (strpos($link, 'http') === false && strpos($link, '/') === false) {
						if (strpos($link, 'www.') !== false) continue;
                        $link = $this->domain . '/' . $link;
                    }
					// dont index email addresses
					else if (strpos($link, 'mailto:') !== false) {
						continue;
					}
					// skip link if it isnt on the same domain
					else if (strpos($link, $this->domain) === false) {
                        continue;
					}

					// only add request if this is the first time in this session
					$link_id = $this->checkIfFirstRequest($link);
					if ($link_id === true) {
						$this->addRequest($link);
					} else {
						// update the inbound link count
						$this->updateInboundCount($link_id);
					}

				}

				// garbage collect
				unset($urlMatches);

				// crawl any newly found URLs
				$this->crawlUrls();

			}

		}
	}

	/**
	 * Crawls the given URL and parses the data looking for
	 * any links.  If a link is found to be using the same
	 * domain name or is a relative URL, it is recursively
	 * parsed as well.
	 *
	 * @access	protected
	 * @return	void
	 */
	protected function crawlUrls()
	{
		// only execute if a cURL request is not running
		if (!$this->running) {
			$this->execute();
		}
	}

	/**
	 * Removes all old entries of the sitemap from the database.
	 *
	 * @access 	private
	 * @return	void
	 */
	private function resetIndex()
	{
		// remove any non-existant links
		$sql = sprintf('DELETE FROM crawl_index WHERE last_update < "%s"', date('Y-m-d H:i:s', strtotime('-2 weeks')));
		mysql_query($sql, $this->db);

		// update all inbound link counters to 1
		mysql_query('UPDATE crawl_index SET `count` = 1');

		// optimize the table due to the deletions
		mysql_query('OPTIMIZE TABLE crawl_index');
	}

	/**
	 * Creates a temporary table for handling page counters.
	 *
	 * @access	private
	 */
	private function createTempTable()
	{
		// generate a temp table name
		$this->tempTable = 'crawl_index_temp';

		// ensure the temp table doesnt exist
		$sql = sprintf('DROP TABLE IF EXISTS %s', $this->tempTable);
		mysql_query($sql, $this->db);

		// create the table and give an exclusive write lock
		$sql = sprintf('CREATE TEMPORARY TABLE %s (page_id INT(10) UNSIGNED NOT NULL) ENGINE=MEMORY', $this->tempTable);
		mysql_query($sql, $this->db);
	}

	/**
	 * Updates the inbound link counts.
	 *
	 * @access	private
	 */
	private function finalizeUrlCounts()
	{
		// batch process any remaining pages in the page cache
		if (!empty($this->page_cache)) {
			$sql = sprintf('INSERT INTO %s (page_id) VALUES (%s)', $this->tempTable, implode('),(', $this->page_cache));
			mysql_query($sql);
		}

		// update the counts
		$sql = sprintf('UPDATE crawl_index
						LEFT JOIN (SELECT page_id, COUNT(page_id) AS total FROM %s GROUP BY page_id) AS crawl_count
						ON (crawl_index.id = crawl_count.page_id)
						SET crawl_index.count = (crawl_index.count + IFNULL(crawl_count.total, 0))',
						$this->tempTable);

		mysql_query($sql);

		// delete the temporary table
		$sql = sprintf('DROP TABLE %s', $this->tempTable);
		mysql_query($sql);
	}

}
?>