Mercurial > hg > ywww
view goodreads/goodreadsReviews.php @ 50:99a730ffeaf6
more parameterisation
author | Charlie Root |
---|---|
date | Sun, 27 Jan 2019 14:24:02 -0500 |
parents | 077b0a0a3e6d |
children |
line wrap: on
line source
<?php
/**
 * Goodreads review proxy.
 *
 * Reads an ISBN from the `isbn` query parameter, fetches the Goodreads
 * reviews-widget iframe for it, and echoes the reviews found there as a small
 * XML document:
 *
 *   <Reviews><Review><Rating/><Body/><URL/></Review>...</Reviews>
 *
 * Requires simple_html_dom.php (file_get_html) to be resolvable via include.
 */
include_once('simple_html_dom.php');

/**
 * Turn text taken from the scraped HTML into safe XML character data.
 *
 * HTML entities are decoded first and the result is then escaped with the
 * XML entity set, so HTML-only entities (e.g. &nbsp;) and bare '&' / '<'
 * cannot make the output document malformed.
 * NOTE(review): assumes simple_html_dom's `plaintext` leaves entities
 * encoded; if the installed version already decodes them, a literal "&amp;"
 * typed by a reviewer would be decoded once more - confirm.
 *
 * @param string $text Raw text extracted from the page.
 * @return string Text safe to place between XML tags.
 */
function goodreads_xml_text($text)
{
    $decoded = html_entity_decode((string) $text, ENT_QUOTES, 'UTF-8');

    // ENT_SUBSTITUTE keeps the rest of the string when it contains an invalid
    // UTF-8 sequence (e.g. after a byte-wise substr) instead of returning ''.
    return htmlspecialchars($decoded, ENT_QUOTES | ENT_XML1 | ENT_SUBSTITUTE, 'UTF-8');
}

/**
 * Fetch the review widget at $url and echo its reviews as XML.
 *
 * Each div.gr_review_text becomes one <Review>. The rating and "more" link
 * are looked up by position (the n-th span.gr_rating / a.gr_more_link is
 * paired with the n-th review), so they are only correct while the page
 * emits exactly one of each per review.
 *
 * If the page cannot be fetched or parsed, an empty <Reviews> document is
 * echoed instead of failing with a fatal error.
 *
 * @param string $url Fully built widget URL.
 * @return void The XML is written to the output; nothing is returned.
 */
function scraping_goodreads($url)
{
    $ret = "<?xml version=\"1.0\"?>";
    $ret .= "<Reviews>";

    // create HTML DOM; guard against a failed fetch before dereferencing it
    $html = file_get_html($url);

    if ($html) {
        $loop = 0;

        foreach ($html->find('div[class="gr_review_text"]') as $review) {
            $rating = $html->find('span[class="gr_rating"]', $loop);
            $link = $html->find('a[class="gr_more_link"]', $loop);

            // Only the first byte of the rating element's text is used.
            $ratingText = $rating ? substr($rating->plaintext, 0, 1) : '';

            // The last 7 bytes of the trimmed review text are dropped.
            // NOTE(review): presumably this removes a trailing "...more"
            // label; a review without that suffix loses 7 bytes of real
            // text - confirm against the live widget markup.
            $bodyText = trim(substr(trim($review->plaintext), 0, -7));

            // Concatenating the element yields its outer HTML, not just the
            // href. Kept as-is for existing consumers. Any "]]>" inside it is
            // split so it cannot terminate the CDATA section early.
            $linkMarkup = $link ? (string) $link : '';
            $linkMarkup = str_replace(']]>', ']]]]><![CDATA[>', $linkMarkup);

            $ret .= "<Review>";
            $ret .= "<Rating>" . goodreads_xml_text($ratingText) . "</Rating>";
            $ret .= "<Body>" . goodreads_xml_text($bodyText) . "</Body>";
            $ret .= "<URL><![CDATA[" . $linkMarkup . "]]></URL>";
            $ret .= "</Review>";
            $loop++;
        }

        // Release the parser's internal references.
        $html->clear();
        unset($html);
    }

    $ret .= "</Reviews>";

    // Drop every byte that is not alphanumeric, punctuation or whitespace.
    // The pattern has no /u modifier, so it works byte-wise.
    $ret = preg_replace("/[^[:alnum:][:punct:][:space:]]/", "", $ret);

    echo convert_utf8($ret);
}

/**
 * Return $str as UTF-8, converting only when it does not already look like it.
 *
 * @param string $str Input string.
 * @return string $str unchanged if seems_utf8() accepts it, otherwise the
 *                result of mb_convert_encoding($str, 'UTF-8').
 */
function convert_utf8($str)
{
    if (!seems_utf8($str)) {
        return mb_convert_encoding($str, 'UTF-8');
    }

    return $str;
}

/**
 * Structural check that a byte string follows the UTF-8 lead/continuation
 * byte layout.
 *
 * Lead bytes announcing 1 to 5 continuation bytes are accepted (including the
 * legacy 5- and 6-byte forms); every announced continuation byte must match
 * 10bbbbbb. Overlong encodings and code point ranges are not checked.
 *
 * @param string $Str Bytes to inspect.
 * @return bool True if every sequence is well-formed by the rule above.
 */
function seems_utf8($Str)
{
    $length = strlen($Str); // hoisted: invariant across the scan

    for ($i = 0; $i < $length; $i++) {
        $byte = ord($Str[$i]);

        if ($byte < 0x80) {
            continue;                       # 0bbbbbbb
        } elseif (($byte & 0xE0) == 0xC0) {
            $n = 1;                         # 110bbbbb
        } elseif (($byte & 0xF0) == 0xE0) {
            $n = 2;                         # 1110bbbb
        } elseif (($byte & 0xF8) == 0xF0) {
            $n = 3;                         # 11110bbb
        } elseif (($byte & 0xFC) == 0xF8) {
            $n = 4;                         # 111110bb
        } elseif (($byte & 0xFE) == 0xFC) {
            $n = 5;                         # 1111110b
        } else {
            return false;                   # Does not match any model
        }

        for ($j = 0; $j < $n; $j++) {       # n bytes matching 10bbbbbb follow ?
            if ((++$i == $length) || ((ord($Str[$i]) & 0xC0) != 0x80)) {
                return false;
            }
        }
    }

    return true;
}

// The ISBN comes straight from the request: accept it only when it is a
// plain string (not missing, not an array) and URL-encode it so it cannot
// add or override query parameters on the upstream request.
$isbn = (isset($_GET['isbn']) && is_string($_GET['isbn'])) ? $_GET['isbn'] : '';

// scraping_goodreads() echoes the XML itself and returns nothing.
scraping_goodreads('http://www.goodreads.com/api/reviews_widget_iframe?did=8vxmKmAn8AVolobtvBbMIg&isbn=' . rawurlencode($isbn));