|
<?php
// Fetches one gallery page with cURL, parses its tags with parseHtml() and
// walks over every parsed <a ...> tag.
include('config.php');

// NOTE(review): $row is not defined in this file - presumably it is provided
// by config.php or by the surrounding code; confirm before relying on it.
$url = $row['URL']; // gallery url

$ch = curl_init();
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5); // 5 seconds
curl_setopt($ch, CURLOPT_USERAGENT, "teksonline spider");
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // return the body instead of printing it
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_MAXREDIRS, 2);
$output = curl_exec($ch);
$response_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
curl_close($ch);

// curl_exec() returns false when the transfer fails; treat that as an empty
// document so the code below always works on a string.
if ($output === false) {
    $output = '';
}

// Collapse runs of spaces; fixes a whitespace problem between tags on some
// sloppy HTML.
$output = preg_replace('/ {2,}/is', ' ', $output);
$array = parseHTML($output);

echo "{$row['ID']} $response_code: $url\n";
// Example of how the parsed data is laid out: $array['A'][0]['HREF']
echo "\n";

// A page without any <a ...> tag carrying attributes has no 'A' entry at all,
// so fall back to an empty list instead of iterating an undefined index.
$links = isset($array['A']) ? $array['A'] : array();
foreach ($links as $href) {
    /* $href holds the attributes of one anchor found in the page; it is
       probably a picture link or a movie link - do something with it here.
    */
}
/**
 * Extracts the attributes of every tag found in an HTML string.
 *
 * The text between each "<" and the next ">" is split on single spaces. The
 * first token is the tag name; every following non-empty token is treated as
 * an attribute of the form name=value. Tags without attributes (e.g. <br>,
 * </font>) are not recorded.
 *
 * Limitations: values keep their surrounding quotes exactly as written, and a
 * quoted value containing a space is split into several tokens.
 *
 * @param string $s_str HTML source to scan.
 *
 * @return array Map of upper-cased tag name => list of occurrences, each
 *               occurrence being a map of upper-cased attribute name =>
 *               trimmed raw value ('' when the attribute has no value),
 *               e.g. $result['A'][0]['HREF'].
 */
function parseHtml($s_str) {
    $s_str = (string) $s_str;
    $a_html = array();
    $i_offset = 0;

    // Search for a tag in string
    while (($i_open = strpos($s_str, '<', $i_offset)) !== false) {
        $i_close = strpos($s_str, '>', $i_open + 1);
        if ($i_close === false) {
            // Unterminated tag: nothing more can be parsed. Without this
            // guard the search offset would fall back to 0 and never advance.
            break;
        }
        $i_offset = $i_close + 1;

        // Everything between "<" and ">"
        $s_inner = (string) substr($s_str, $i_open + 1, $i_close - $i_open - 1);
        $a_tokens = explode(' ', $s_inner);

        // Here we get the tag's name
        $s_tagName = strtoupper(array_shift($a_tokens));

        // Collect the tag options, like src="http://..."; the option name is
        // 'SRC' and the value is '"http://..."'.
        $a_options = array();
        foreach ($a_tokens as $s_token) {
            if ($s_token === '') {
                // Produced by doubled or trailing spaces inside the tag.
                continue;
            }
            // Split on the first "=" only so that values which themselves
            // contain "=" (URLs with a query string) are kept whole.
            $a_pair = explode('=', $s_token, 2);
            $s_optionName = strtoupper($a_pair[0]);
            $a_options[$s_optionName] = isset($a_pair[1]) ? trim($a_pair[1]) : '';
        }

        // Not interested in <br>, </font> or anything else without options.
        if (count($a_options) > 0) {
            if (!isset($a_html[$s_tagName])) {
                $a_html[$s_tagName] = array();
            }
            $a_html[$s_tagName][] = $a_options;
        }
    }

    return $a_html;
}
?>
There you are, halfway home.
|