Mercurial > hg > ywww
annotate goodreads/goodreadsReviews.php @ 9:232deb0b066a
tidy up debugging, only show parms on non-throttled error
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Sat, 18 Feb 2017 11:38:36 +0000 |
parents | 077b0a0a3e6d |
children |
rev | line source |
---|---|
6
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
1 <?php |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
2 include_once('simple_html_dom.php'); |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
3 |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
4 $isbn=$_GET['isbn']; |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
5 |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
6 function scraping_goodreads($url) { |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
7 // create HTML DOM |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
8 $html = file_get_html($url); |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
9 |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
10 $loop = 0; |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
11 $ret = "<?xml version=\"1.0\"?>"; |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
12 $ret .= "<Reviews>"; |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
13 foreach($html->find('div[class="gr_review_text"]') as $review) { |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
14 $rating = $html->find('<span[class="gr_rating"]',$loop); |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
15 $link = $html->find('<a[class="gr_more_link"]',$loop); |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
16 $ret .= "<Review>"; |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
17 $ret .= "<Rating>" . substr( $rating->plaintext, 0, 1) . "</Rating>"; |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
18 $ret .= "<Body>" . trim(substr(trim($review->plaintext),0,-7)) . "</Body>"; |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
19 $ret .= "<URL><![CDATA[" . $link . "]]></URL>"; |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
20 $ret .= "</Review>"; |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
21 $loop++; |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
22 } |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
23 |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
24 $ret .= "</Reviews>"; |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
25 $html->clear(); |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
26 unset($html); |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
27 $ret = preg_replace("/[^[:alnum:][:punct:][:space:]]/","",$ret); |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
28 echo convert_utf8($ret); |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
29 } |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
30 |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
31 function convert_utf8($str){ |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
32 if(!seems_utf8($str)) |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
33 return mb_convert_encoding($str, 'UTF-8'); |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
34 return $str; |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
35 } |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
36 |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
37 function seems_utf8($Str) { |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
38 for ($i=0; $i<strlen($Str); $i++) { |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
39 if (ord($Str[$i]) < 0x80) continue; # 0bbbbbbb |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
40 elseif ((ord($Str[$i]) & 0xE0) == 0xC0) $n=1; # 110bbbbb |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
41 elseif ((ord($Str[$i]) & 0xF0) == 0xE0) $n=2; # 1110bbbb |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
42 elseif ((ord($Str[$i]) & 0xF8) == 0xF0) $n=3; # 11110bbb |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
43 elseif ((ord($Str[$i]) & 0xFC) == 0xF8) $n=4; # 111110bb |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
44 elseif ((ord($Str[$i]) & 0xFE) == 0xFC) $n=5; # 1111110b |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
45 else return false; # Does not match any model |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
46 for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ? |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
47 if ((++$i == strlen($Str)) || ((ord($Str[$i]) & 0xC0) != 0x80)) |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
48 return false; |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
49 } |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
50 } |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
51 return true; |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
52 } |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
53 |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
54 $ret = scraping_goodreads('http://www.goodreads.com/api/reviews_widget_iframe?did=8vxmKmAn8AVolobtvBbMIg&isbn=' . $isbn); |
077b0a0a3e6d
remaining originals according to dependency walk
Robert Boland <robert@markup.co.uk>
parents:
diff
changeset
|
55 ?> |