<?php
/*
 * Entry point: runs only when a non-empty "query" request parameter was submitted.
 * The is_string() check rejects array input (e.g. ?query[]=x), which urlencode() cannot handle,
 * and the strict comparison lets a literal "0" through as a valid search term.
 */
if (isset($_REQUEST['query']) && is_string($_REQUEST['query']) && $_REQUEST['query'] !== '') {
    /* Setting the variables.
    A Google query URL looks like this:
        http://www.google.com/search?q=MYQUERY&start=MYSTART
    so these are the parts needed to build it:
        $GooglePrefix      -> http://www.google.com/search?q=
        $query             -> MYQUERY (URL-encoded)
        $GoogleCountSuffix -> &start=
        $loop (counter)    -> MYSTART
    */
    $GooglePrefix = "http://www.google.com/search?q=";
    $query = urlencode($_REQUEST['query']);
    $GoogleCountSuffix = "&start=";
    // Enter any URLs you don't want to crawl into this array - sites that would never have adsense
    $dontcrawlarray = array('wikipedia.org', 'google.com', 'amazon.com');
    //-----------------------------
    // The query is user input, so it is escaped before being echoed into the HTML page.
    echo "Results for " . htmlspecialchars($_REQUEST['query'], ENT_QUOTES, 'UTF-8') . "<br /><br />";

    /* Loop to get the Google result pages.
    While going through the loop, we build the query URL out of the parts and the loop counter.
    The complete source code of every result page is appended to $res, so in the end
    ALL of them are stored in one long string.
    The counter is the "start" offset: 0 and 10, i.e. two result pages.
    */
    $res = '';
    for ($loop = 0; $loop <= 10; $loop += 10) {
        $completeURL = $GooglePrefix . $query . $GoogleCountSuffix . $loop;
        $res .= webFetcher($completeURL); // we use the function webFetcher to get the page
    }

    /* Now we use a regular expression to filter the URLs out of the result pages.
    For this, the function "do_reg" is called, giving it the complete result string and the regular expression.
    The returned value (an array of matches) is stored in $resultURLs.
    */
    $resultURLs = do_reg($res, "/h3.class=r.*(http.*)\"/U");

    /* Loop through the list of dontcrawlarray domains and remove them before we crawl */
    foreach ($dontcrawlarray as $url) {
        $resultURLs = array_ereg_search($url, $resultURLs);
    }

    /* Now we want to fetch all those URLs, one after the other. */
    foreach ($resultURLs as $text) { // $text is the result URL we are at
        $comp = webFetcher($text); // we get the page at the URL
        /* We are looking for "google_ad", a code snippet that tells us that google ads are used in the page.
        It is a fixed string, so a plain substring search is enough.
        */
        if (strpos($comp, 'google_ad') !== false) {
            // The URL was scraped from a remote page, so it is escaped before being printed.
            echo htmlspecialchars($text, ENT_QUOTES, 'UTF-8') . "<br />";
        }
    }
}
/**
 * Runs a regular expression over a text and returns everything its first capture group matched.
 *
 * @param string $text  The text to search.
 * @param string $regex A PCRE pattern, delimiters included, with at least one capture group.
 * @return array The strings captured by group 1, in order of appearance. An empty array is
 *               returned when nothing matches, when the pattern has no capture group, or
 *               when the pattern could not be run, so callers can always count()/iterate it.
 */
function do_reg($text, $regex)
{
    $found = preg_match_all($regex, $text, $matches, PREG_PATTERN_ORDER);
    // With PREG_PATTERN_ORDER, index 1 holds the group-1 captures; it is absent if the
    // pattern defines no group, and preg_match_all() yields false if the pattern fails.
    if ($found === false || !isset($matches[1])) {
        return array();
    }
    return $matches[1];
}
/**
 * Fetches a page from the web with cURL - just give it the URL.
 *
 * @param string $url The address to download.
 * @return string The response body, or an empty string when the transfer fails.
 */
function webFetcher($url)
{
    $crawl = curl_init(); // the curl library is initiated
    if ($crawl === false) {
        return '';
    }
    curl_setopt($crawl, CURLOPT_URL, $url); // the URL is set
    curl_setopt($crawl, CURLOPT_RETURNTRANSFER, 1); // return the result instead of printing it
    $body = curl_exec($crawl); // curl is executed; false signals a failed transfer
    curl_close($crawl); // closes the curl handle
    // A failed transfer is reported as an empty page so callers can keep concatenating
    // and searching the result as a string.
    return $body === false ? '' : $body;
}
/**
 * Returns the entries of $array that do NOT match $val - used to remove the
 * dontcrawlarray URLs from the result list.
 *
 * $val is applied as a case-insensitive regular expression, as the eregi() call this
 * function was originally built on did. eregi() no longer exists in PHP 7+, so the
 * match is done with preg_match() and the "i" modifier instead.
 *
 * @param string $val   Pattern to exclude (e.g. 'google.com'; note "." matches any character).
 * @param array  $array Strings to filter.
 * @return array Newly indexed list of the entries that did not match $val.
 */
function array_ereg_search($val, $array) {
    // "~" is used as the delimiter; any "~" inside the pattern is escaped so it stays literal.
    $pattern = '~' . str_replace('~', '\~', $val) . '~i';
    $return = array();
    foreach ($array as $v) {
        if (!preg_match($pattern, $v)) {
            $return[] = $v;
        }
    }
    return $return;
}
?>