annotate goodreads/goodreadsReviews.php @ 9:232deb0b066a

tidy up debugging, only show parms on non-throttled error
author Henry S. Thompson <ht@markup.co.uk>
date Sat, 18 Feb 2017 11:38:36 +0000
parents 077b0a0a3e6d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
1 <?php
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
2 include_once('simple_html_dom.php');
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
3
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
4 $isbn=$_GET['isbn'];
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
5
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
6 function scraping_goodreads($url) {
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
7 // create HTML DOM
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
8 $html = file_get_html($url);
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
9
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
10 $loop = 0;
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
11 $ret = "<?xml version=\"1.0\"?>";
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
12 $ret .= "<Reviews>";
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
13 foreach($html->find('div[class="gr_review_text"]') as $review) {
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
14 $rating = $html->find('<span[class="gr_rating"]',$loop);
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
15 $link = $html->find('<a[class="gr_more_link"]',$loop);
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
16 $ret .= "<Review>";
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
17 $ret .= "<Rating>" . substr( $rating->plaintext, 0, 1) . "</Rating>";
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
18 $ret .= "<Body>" . trim(substr(trim($review->plaintext),0,-7)) . "</Body>";
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
19 $ret .= "<URL><![CDATA[" . $link . "]]></URL>";
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
20 $ret .= "</Review>";
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
21 $loop++;
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
22 }
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
23
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
24 $ret .= "</Reviews>";
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
25 $html->clear();
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
26 unset($html);
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
27 $ret = preg_replace("/[^[:alnum:][:punct:][:space:]]/","",$ret);
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
28 echo convert_utf8($ret);
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
29 }
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
30
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
31 function convert_utf8($str){
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
32 if(!seems_utf8($str))
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
33 return mb_convert_encoding($str, 'UTF-8');
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
34 return $str;
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
35 }
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
36
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
37 function seems_utf8($Str) {
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
38 for ($i=0; $i<strlen($Str); $i++) {
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
39 if (ord($Str[$i]) < 0x80) continue; # 0bbbbbbb
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
40 elseif ((ord($Str[$i]) & 0xE0) == 0xC0) $n=1; # 110bbbbb
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
41 elseif ((ord($Str[$i]) & 0xF0) == 0xE0) $n=2; # 1110bbbb
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
42 elseif ((ord($Str[$i]) & 0xF8) == 0xF0) $n=3; # 11110bbb
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
43 elseif ((ord($Str[$i]) & 0xFC) == 0xF8) $n=4; # 111110bb
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
44 elseif ((ord($Str[$i]) & 0xFE) == 0xFC) $n=5; # 1111110b
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
45 else return false; # Does not match any model
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
46 for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
47 if ((++$i == strlen($Str)) || ((ord($Str[$i]) & 0xC0) != 0x80))
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
48 return false;
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
49 }
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
50 }
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
51 return true;
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
52 }
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
53
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
54 $ret = scraping_goodreads('http://www.goodreads.com/api/reviews_widget_iframe?did=8vxmKmAn8AVolobtvBbMIg&isbn=' . $isbn);
077b0a0a3e6d remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff changeset
55 ?>