OK, here is something I cobbled together:
An abstracts crawler that takes the result abstracts from Google and Bing and counts the words used in them, filtered through a stop list.
It is really, really shoddily thrown together, but it should give you an idea of the space you are operating in.
Several queries can be submitted at once; use the humble comma (,) as a separator.
Have fun.
An abstracts crawler that takes the result abstracts from Google and Bing and counts the words used in them, filtered through a stop list.
It is really, really shoddily thrown together, but it should give you an idea of the space you are operating in.
Several queries can be submitted at once; use the humble comma (,) as a separator.
Have fun.
Code:
<HEAD>
<STYLE type="text/css">
/* Page stylesheet: an element reset, base typography, and a small fixed-width column grid. */
/* Reset: strip margin, padding and border from common elements and make font properties inherit. */
html, body, div, span, object, iframe, h1, h2, h3, h4, h5, h6, p, blockquote, pre, a, abbr, acronym, address, code, del, dfn, em, img, q, dl, dt, dd, ol, ul, li, fieldset, form, label, legend, table, caption, tbody, tfoot, thead, tr, th, td {margin:0;padding:0;border:0;font-weight:inherit;font-style:inherit;font-size:100%;font-family:inherit;vertical-align:baseline;}
/* Base typography for body text and headings. */
body {font-size:75%;color:#333;background:#fff;font-family:"Helvetica Neue", Arial, Helvetica, sans-serif;}
h1, h2, h3, h4, h5, h6 {font-weight:normal;color:#444;}
h1 {font-size:3em;line-height:1;margin-bottom:0.5em;}
h2 {font-size:2em;margin-bottom:0.75em;}
h3 {font-size:1.5em;line-height:1;margin-bottom:1em;}
h4 {font-size:1.2em;line-height:1.25;margin-bottom:1.25em;}
h5 {font-size:1em;font-weight:bold;margin-bottom:1.5em;}
h6 {font-size:1em;font-weight:bold;}
h1 img, h2 img, h3 img, h4 img, h5 img, h6 img {margin:0;}
p {margin:0 0 1.5em;}
/* Page wrapper: 750px wide, centred horizontally. */
.container {width:750px; margin:0 auto;}
/* Grid columns float left with a 10px right gutter; .last removes the gutter on the final column of a row. */
.column, div.span-1, div.span-2, div.span-3, div.span-4, div.span-5, div.span-6, div.span-7, div.span-8, div.span-9, div.span-10, div.span-11, div.span-12, div.span-13, div.span-14, div.span-15, div.span-16, div.span-17, div.span-18, div.span-19, div.span-20, div.span-21, div.span-22, div.span-23, div.span-24 {float:left;margin-right:10px;}
.last, div.last {margin-right:0;}
/* Column widths: each step adds 40px (a 30px unit plus the 10px gutter). Only the spans listed below have a width. */
/* NOTE(review): markup generated elsewhere in this file uses .span-18, which has no width rule here - confirm whether one is needed. */
.span-1 {width:30px;}
.span-2 {width:70px;}
.span-3 {width:110px;}
.span-4 {width:150px;}
.span-5 {width:190px;}
.span-15 {width:590px;}
.span-16 {width:630px;}
.span-19 {width:750px;}
</STYLE>
</HEAD>
<BODY>
<div class="container">
<div class="column span-19 last">
<h2>Abstracts Crawl</h2>
<!-- Query form: submits the comma-separated "queries" field via GET to GoogleAbstracts01.php. -->
<form method="GET" action="GoogleAbstracts01.php">
<!-- "text" is the valid input type; <input> is a void element, so its caption lives in a <label> instead of between open/close tags. -->
<input type="text" id="queries" name="queries"> <label for="queries">Enter queries here, separated by commas</label><br>
<input type="submit" name="submit">
</form>
</div>
<?php
// Request handler. When the form supplied a non-empty "queries" string, each
// comma-separated query is sent to Google and to Bing, and a word-count table
// is printed for each engine's abstracts.

// Only accept a plain string; something like ?queries[]=x would otherwise reach explode() as an array.
$rawQueries = (isset($_GET['queries']) && is_string($_GET['queries'])) ? trim($_GET['queries']) : '';

if ($rawQueries !== '')
{
    // This wrapper <div> is closed by the first SEoutput() call; the closing
    // </div> echoed after the loop closes the last <div> SEoutput() leaves open.
    echo ('<div class="column span-19 last">');

    // Alternative (German-language Google): "http://www.google.ch/search?hl=de&lr=lang_de&q="
    $GooglePrefix = "http://www.google.com/search?q=";
    $GoogleCountSuffix ="&start=";
    $GoogleRegx = "/div.class=.s..(.*)\<cite>/U";
    $bingPrefix = "http://www.bing.com/search?q=";
    $BingCountSuffix = "&first=";
    $BingRegx = "/a>..h3>..div>.p>(.*).\/p>.div.class..sb.meta/U";
    $BingSuffix ="&setmkt=en-WW&setlang=match";

    $queries = explode(',', $rawQueries);
    foreach ($queries as $query)
    {
        // Ignore padding around the separator and skip blank entries ("a,,b" or a trailing comma).
        $query = trim($query);
        if ($query === '')
        {
            continue;
        }

        // The query is user input: escape it for HTML output, URL-encode it for the request.
        $label = htmlspecialchars($query, ENT_QUOTES, 'UTF-8');
        $encoded = urlencode($query);

        // Each engine's abstracts are passed on separately, so one result set
        // never leaks into the word counts of another.
        echo "<h1>Google</h1>Suche nach: ".$label."<br>";
        SEoutput(getAbstracts($encoded, $GooglePrefix, $GoogleCountSuffix, "", $GoogleRegx));

        echo "<h1>Bing</h1>Suche nach: ".$label."<br>";
        SEoutput(getAbstracts($encoded, $bingPrefix, $BingCountSuffix, $BingSuffix, $BingRegx));
    }
    echo ("</div>");
}
/**
 * Prints the word-count table for one batch of abstracts.
 *
 * The abstracts themselves are not printed; only the "Abstracts" heading and
 * the counts derived from them are. Words are listed when they occur more than
 * twice and are not in the stop list.
 *
 * Markup contract: the function first emits a closing </div> (the caller must
 * have a <div> open) and leaves its own word-count <div> open for the caller,
 * or the next call, to close.
 *
 * @param string $resultat HTML fragment containing the abstracts to analyse
 * @return void
 */
function SEoutput($resultat)
{
    echo "<h3>Abstracts</h3></div>";

    $index = index_page(strip_tags($resultat));
    $stopwords = array("the", "this", "then", "a", "and", "i", "of", "or", "to", "on", "with", "is", "all", "for", "in", "you", "me", "an", "as", "are" ,"we", "be", "can", "your", "it", "do", "how", "that", "what", "will", "was", "he", "search", "may", "at", "from", "about", "any", "by", "has", "there", "no", "yes");

    // The class list must be quoted: unquoted, only "column" is the class and
    // "span-19" / "last" are parsed as separate, meaningless attributes.
    echo ('<div class="column span-19 last">');
    echo "<h3>Word Counts</h3>";

    foreach ($index as $entry)
    {
        // Strict comparison: both sides are plain strings, no type juggling wanted.
        if ($entry['count'] > 2 && !in_array($entry['word'], $stopwords, true))
        {
            echo ('<div class="column span-3">');
            echo "word: ".$entry['word']."</div>";
            echo ('<div class="column span-15 last">');
            echo ('count: '.$entry['count']."</div>");
        }
    }
}
/**
 * Fetches search result pages and returns the extracted abstracts as HTML.
 *
 * The URL is built as $Prefix.$query.$CountSuffix.<offset>.$Suffix and is
 * requested twice, with offsets 0 and 10. The responses are concatenated and
 * matched against $regX; every capture is stripped of tags and wrapped in a
 * numbered pair of column <div>s.
 *
 * @param string $query       URL-encoded search query
 * @param string $Prefix      URL up to and including the query parameter name
 * @param string $CountSuffix Parameter fragment that precedes the result offset
 * @param string $Suffix      Extra URL parameters appended after the offset
 * @param string $regX        Pattern whose first capture group is the abstract
 * @return string HTML fragment; empty when nothing was fetched or matched
 */
function getAbstracts($query, $Prefix, $CountSuffix, $Suffix, $regX )
{
    $html = '';
    for ($offset = 0; $offset <= 10; $offset += 10)
    {
        $crawl = curl_init();
        curl_setopt($crawl, CURLOPT_URL, $Prefix.$query.$CountSuffix.$offset.$Suffix);
        curl_setopt($crawl, CURLOPT_RETURNTRANSFER, 1);
        $page = curl_exec($crawl);
        curl_close($crawl);

        // curl_exec() returns false on failure; skip that page rather than abort the crawl.
        if (is_string($page))
        {
            $html .= $page;
        }
    }

    $abstracts = do_reg($html, $regX);
    if (!is_array($abstracts))
    {
        $abstracts = array();
    }

    // The capture list is zero-based, so start at the first entry (index 0)
    // and show a 1-based running number.
    $resultat = '';
    $number = 0;
    foreach ($abstracts as $abstract)
    {
        $number++;
        $resultat .= "<div class='column span-1'>".$number."</div><div class='column span-18 last'>".strip_tags($abstract)."</div>";
    }
    return $resultat;
}
/**
 * Returns every first-capture-group match of $regex in $text.
 *
 * @param string $text  Subject string to search
 * @param string $regex PCRE pattern; its first capture group is what gets returned
 * @return array Zero-based list of captured strings; empty when nothing matched,
 *               the pattern has no capture group, or matching failed
 */
function do_reg($text, $regex)
{
    $matches = array();
    preg_match_all($regex, $text, $matches, PREG_PATTERN_ORDER);

    // Index 1 is missing when the pattern has no capture group or could not
    // be compiled; always hand back an array so callers can count() it safely.
    return (isset($matches[1]) && is_array($matches[1])) ? $matches[1] : array();
}
/**
 * Builds a word-frequency index of a piece of (possibly HTML) text.
 *
 * Tags are stripped, the text is lower-cased and split on any run of
 * whitespace. Tokens containing anything other than the letters a-z (digits,
 * punctuation, non-ASCII bytes) are discarded. The remaining words are
 * counted and returned in natural, case-insensitive sort order.
 *
 * @param string $file Text or HTML to index
 * @return array List of array('word' => string, 'count' => int) entries
 */
function index_page($file) {
    // Put a space between adjacent tags so "<b>foo</b><i>bar</i>" does not
    // collapse into the single token "foobar" once the tags are stripped.
    $work = preg_replace('/[>][<]/', '> <', $file);
    $work = strtolower(strip_tags($work));

    // Split on ANY whitespace run: a lone tab or newline separates words
    // just as a space does.
    $tokens = preg_split('/\s+/', $work, -1, PREG_SPLIT_NO_EMPTY);
    if ($tokens === false) {
        $tokens = array();
    }

    // Keep purely alphabetic tokens only.
    $words = array();
    foreach ($tokens as $token) {
        if (preg_match('/^[a-z]+\z/', $token) === 1) {
            $words[] = $token;
        }
    }

    // Sort first so the index comes out ordered; array_count_values() keeps
    // the order in which each distinct word first appears.
    natcasesort($words);

    $index = array();
    foreach (array_count_values($words) as $word => $count) {
        $index[] = array('word' => (string) $word, 'count' => $count);
    }
    return $index;
}
?>
</BODY>