Mercurial > hg > ywww
comparison goodreads/goodreadsReviews.php @ 6:077b0a0a3e6d
remaining originals according to dependency walk
| author | Robert Boland <robert@markup.co.uk> |
|---|---|
| date | Thu, 16 Feb 2017 22:29:02 +0000 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 5:55445b456ad0 | 6:077b0a0a3e6d |
|---|---|
| 1 <?php | |
| 2 include_once('simple_html_dom.php'); | |
| 3 | |
// ISBN requested through the query string. Fall back to an empty string when
// the parameter is absent or is not a plain string (e.g. ?isbn[]=...), so the
// rest of the script always works with a string and no notice is raised.
$isbn = (isset($_GET['isbn']) && is_string($_GET['isbn'])) ? $_GET['isbn'] : '';
| 5 | |
/**
 * Convert text scraped from HTML into XML-safe character data.
 *
 * HTML entities are decoded first and the result is re-escaped with only the
 * characters XML requires (&, <, >), so named HTML entities that XML does not
 * define never reach the output document.
 *
 * @param mixed $text Scraped text; cast to string before processing.
 * @return string Text that is safe to place between XML tags.
 */
function goodreads_xml_text($text) {
    $decoded = html_entity_decode((string) $text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    // ENT_SUBSTITUTE keeps the rest of the text when a byte sequence is invalid,
    // instead of htmlspecialchars() returning an empty string for the whole value.
    return htmlspecialchars($decoded, ENT_NOQUOTES | ENT_SUBSTITUTE, 'UTF-8');
}

/**
 * Fetch the page at $url, extract the reviews it contains and echo them as XML.
 *
 * The emitted document has the shape
 * <Reviews><Review><Rating/><Body/><URL/></Review>...</Reviews>.
 * The function writes to the output directly and returns nothing. When the
 * page cannot be loaded an empty <Reviews> document is emitted.
 *
 * @param string $url Address of the page to scrape.
 * @return void
 */
function scraping_goodreads($url) {
    $ret = "<?xml version=\"1.0\"?>";
    $ret .= "<Reviews>";

    // file_get_html() can hand back false instead of a DOM object; calling
    // find() on that would be a fatal error, so only walk a real document.
    $html = file_get_html($url);
    if ($html) {
        $loop = 0;
        foreach ($html->find('div[class="gr_review_text"]') as $review) {
            // The rating and link are looked up by position: the Nth review text
            // is paired with the Nth rating span and the Nth "more" link.
            // NOTE(review): the leading '<' in these selectors is kept exactly as
            // it was; confirm simple_html_dom ignores it as intended.
            $rating = $html->find('<span[class="gr_rating"]', $loop);
            $link = $html->find('<a[class="gr_more_link"]', $loop);

            // Only the first byte of the rating text is used; a review without
            // a matching rating element yields an empty <Rating>.
            $stars = $rating ? (string) substr($rating->plaintext, 0, 1) : '';

            // The last 7 bytes of the trimmed review text are dropped
            // (presumably a trailing "...more" marker; verify against the page).
            $body = trim((string) substr(trim($review->plaintext), 0, -7));

            // Casting the element to string yields its markup; split any "]]>"
            // so it cannot terminate the CDATA section early.
            $linkMarkup = str_replace(']]>', ']]]]><![CDATA[>', (string) $link);

            $ret .= "<Review>";
            $ret .= "<Rating>" . goodreads_xml_text($stars) . "</Rating>";
            $ret .= "<Body>" . goodreads_xml_text($body) . "</Body>";
            $ret .= "<URL><![CDATA[" . $linkMarkup . "]]></URL>";
            $ret .= "</Review>";
            $loop++;
        }

        // Release the parsed DOM before post-processing the output.
        $html->clear();
        unset($html);
    }

    $ret .= "</Reviews>";

    // Remove everything outside the alnum/punct/space POSIX classes.
    $ret = preg_replace("/[^[:alnum:][:punct:][:space:]]/","",$ret);
    echo convert_utf8($ret);
}
| 30 | |
/**
 * Return $str as UTF-8.
 *
 * A string that seems_utf8() accepts is handed back untouched; anything else
 * is passed through mb_convert_encoding() with UTF-8 as the target encoding.
 *
 * @param string $str Text to normalise.
 * @return string The original string, or its converted form.
 */
function convert_utf8($str) {
    return seems_utf8($str)
        ? $str
        : mb_convert_encoding($str, 'UTF-8');
}
| 36 | |
/**
 * Report whether a string is well-formed UTF-8.
 *
 * Validation is delegated to PCRE, which rejects truncated sequences, stray
 * continuation bytes, overlong encodings, surrogate code points, code points
 * above U+10FFFF and the obsolete 5- and 6-byte forms.
 *
 * @param string $Str Bytes to inspect.
 * @return bool True when $Str is empty or valid UTF-8, false otherwise.
 */
function seems_utf8($Str) {
    // With the /u modifier PCRE validates the whole subject as UTF-8 before
    // matching; the empty pattern then matches immediately (result 1), while a
    // malformed subject makes preg_match() return false.
    return preg_match('//u', (string) $Str) === 1;
}
| 53 | |
// $isbn originates from the request, so it is percent-encoded before being
// placed in the outbound URL; otherwise characters such as '&' or '#' could
// add or truncate query parameters.
// scraping_goodreads() echoes the XML itself and has no return statement, so
// $ret only ever receives null here.
$ret = scraping_goodreads('http://www.goodreads.com/api/reviews_widget_iframe?did=8vxmKmAn8AVolobtvBbMIg&isbn=' . rawurlencode((string) $isbn));
| 55 ?> |
