Mercurial > hg > ywww
diff goodreads/goodreadsReviews.php @ 6:077b0a0a3e6d
remaining originals according to dependency walk
author | Robert Boland <robert@markup.co.uk> |
---|---|
date | Thu, 16 Feb 2017 22:29:02 +0000 |
parents | |
children |
line wrap: on
line diff
<?php
/**
 * goodreads/goodreadsReviews.php
 *
 * Fetches the Goodreads reviews widget for the ISBN passed in the `isbn`
 * query parameter, scrapes the reviews out of the returned HTML and echoes
 * them as a small XML document of the form
 * Reviews > Review > (Rating, Body, URL).
 */
include_once('simple_html_dom.php');

/**
 * Escapes text so it can be used as XML element content.
 *
 * Only `&`, `<` and `>` are rewritten (quotes are legal in element content
 * and are left untouched so the output stays as close as possible to what
 * the script emitted before). Entities that are already encoded are not
 * encoded a second time, and invalid byte sequences are substituted instead
 * of making htmlspecialchars() return an empty string.
 *
 * @param string $text Raw text taken from the scraped page.
 * @return string Text that is safe to place between XML tags.
 */
function goodreads_xml_text($text) {
    return htmlspecialchars(
        (string) $text,
        ENT_XML1 | ENT_NOQUOTES | ENT_SUBSTITUTE,
        'UTF-8',
        false
    );
}

/**
 * Wraps a string in a CDATA section.
 *
 * A literal "]]>" inside the payload would terminate the section early, so
 * it is split across two adjacent CDATA sections.
 *
 * @param string $text Payload to wrap.
 * @return string A well-formed CDATA section (or sequence of sections).
 */
function goodreads_xml_cdata($text) {
    return "<![CDATA[" . str_replace(']]>', ']]]]><![CDATA[>', (string) $text) . "]]>";
}

/**
 * Scrapes the Goodreads reviews widget at $url and echoes the reviews as XML.
 *
 * Each `div.gr_review_text` becomes one Review element. The rating and the
 * "more" link are matched to a review by position: the n-th `span.gr_rating`
 * and the n-th `a.gr_more_link` in the document belong to the n-th review.
 * When the page cannot be fetched or parsed, an empty Reviews document is
 * echoed instead of failing.
 *
 * @param string $url Full URL of the reviews widget page.
 * @return void The XML is written to the output stream.
 */
function scraping_goodreads($url) {
    $ret = "<?xml version=\"1.0\"?>";
    $ret .= "<Reviews>";

    // file_get_html() returns false when the download fails or the page is
    // empty / too large; calling find() on that would be a fatal error.
    $html = file_get_html($url);
    if ($html) {
        // Collect ratings and links once instead of re-scanning the whole
        // document for every review.
        $ratings = $html->find('span[class="gr_rating"]');
        $links = $html->find('a[class="gr_more_link"]');

        $loop = 0;
        foreach ($html->find('div[class="gr_review_text"]') as $review) {
            $rating = isset($ratings[$loop]) ? $ratings[$loop] : null;
            $link = isset($links[$loop]) ? $links[$loop] : null;

            // First character of the rating text.
            // NOTE(review): presumably the star count - confirm against the
            // widget markup.
            $ratingText = ($rating !== null) ? substr($rating->plaintext, 0, 1) : '';

            // The last 7 bytes of the review text are dropped.
            // NOTE(review): presumably the trailing "...more" link label -
            // confirm against the widget markup.
            $bodyText = trim(substr(trim($review->plaintext), 0, -7));

            // NOTE(review): this emits the anchor element's HTML, not just
            // its href. Kept as-is because consumers of this feed are not
            // visible from here.
            $linkHtml = ($link !== null) ? (string) $link : '';

            $ret .= "<Review>";
            $ret .= "<Rating>" . goodreads_xml_text($ratingText) . "</Rating>";
            $ret .= "<Body>" . goodreads_xml_text($bodyText) . "</Body>";
            $ret .= "<URL>" . goodreads_xml_cdata($linkHtml) . "</URL>";
            $ret .= "</Review>";
            $loop++;
        }

        // Release the parser's memory explicitly.
        $html->clear();
        unset($html);
    }

    $ret .= "</Reviews>";

    // Byte-wise filter (no /u modifier): every byte that is not an ASCII
    // letter, digit, punctuation mark or whitespace is removed, which also
    // strips all multi-byte characters.
    $ret = preg_replace("/[^[:alnum:][:punct:][:space:]]/", "", $ret);
    echo convert_utf8($ret);
}

/**
 * Returns $str as UTF-8.
 *
 * Strings that already validate as UTF-8 are returned unchanged. Anything
 * else is run through mb_convert_encoding() with no explicit source
 * encoding, so mbstring's configured internal encoding is used as the source.
 *
 * @param string $str Input string.
 * @return string UTF-8 string.
 */
function convert_utf8($str) {
    if (!seems_utf8($str)) {
        return mb_convert_encoding($str, 'UTF-8');
    }
    return $str;
}

/**
 * Tells whether $Str is valid UTF-8.
 *
 * Delegates to mbstring instead of walking the bytes by hand; unlike a
 * lead-byte/continuation-byte scan this also rejects 5- and 6-byte
 * sequences and overlong encodings, which are not valid UTF-8.
 *
 * @param string $Str String to check.
 * @return bool True when the string is valid UTF-8 (including the empty string).
 */
function seems_utf8($Str) {
    return mb_check_encoding((string) $Str, 'UTF-8');
}

// The ISBN comes straight from the query string: make sure it is present and
// a plain string, then percent-encode it so it cannot inject extra query
// parameters into the Goodreads request.
$isbn = (isset($_GET['isbn']) && is_string($_GET['isbn'])) ? trim($_GET['isbn']) : '';

if ($isbn === '') {
    // Nothing to look up: answer with an empty document without a remote call.
    echo "<?xml version=\"1.0\"?><Reviews></Reviews>";
} else {
    scraping_goodreads('http://www.goodreads.com/api/reviews_widget_iframe?did=8vxmKmAn8AVolobtvBbMIg&isbn=' . rawurlencode($isbn));
}