Mercurial > hg > python
annotate unicode-histogram.py @ 69:157f012ffab7 default tip
from local
author | Henry S Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 17 Jan 2025 15:45:26 +0000 |
parents | 91d71e9760e8 |
children |
rev | line source |
---|---|
48
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/env python |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 # -*- coding: utf-8 -*- |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 # from https://github.com/usc-isi-i2/dig-unicode/blob/master/python/unicode-histogram.py |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 import sys |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 import re |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 try: |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 import simplejson as json |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 except: |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
9 import json |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
10 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 from collections import Counter |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
12 import unicodedata |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
13 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
14 from time import strftime, gmtime |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
15 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
16 """ |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
17 12 December 2014 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
18 for each of {body, title}: |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
19 the unicodeSignature is the sequence of >ascii codepoints, in order, space-separated |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
20 the unicodeCatalog is the bag of >ascii codepoints, sorted/agglomerated using space, comma-separated |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
21 the unicodeHistogram is a json-encoded python dict/json object mapping codepoint to count |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
22 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
23 the unicodeBlockSignature is the sequence of block descriptors (of all >ascii), in order, space-separated |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
24 the unicodeBlockCatalog is the bag of block descriptors, sorted/agglomerated using space, comma-separated |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
25 the unicodeBlockHistogram is a json-encoded python dict/json object mapping block descriptor to count |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
26 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
27 the unicodeCategorySignature is the sequence of category descriptors (of all >ascii), in order, space-separated |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
28 the unicodeCategoryCatalog is the bag of category descriptors, sorted/agglomerated using space, comma-separated |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
29 the unicodeCategoryHistogram is a json-encoded python dict/json object mapping category descriptor to count |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
30 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
31 where block and category descriptors are defined via |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
32 # From http://stackoverflow.com/a/245072 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
33 # retrieved from http://unicode.org/Public/UNIDATA/Blocks.txt |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
34 # Blocks-5.1.0.txt |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
35 # Date: 2008-03-20, 17:41:00 PDT [KW] |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
36 and is formatted using _ rather than ,/space/- |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
37 """ |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
38 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
39 def isAscii(c): |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
40 try: |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
41 return ord(c) <= 127 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
42 except: |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
43 return False |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
44 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
45 def gentime(): |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
46 return strftime("%Y-%m-%d %H:%M:%S", gmtime()) |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
47 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
48 def fmtCodepoint(codepoint, style): |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
49 return codepoint |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
50 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
51 def fmtMetadatum(metadatum, style): |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
52 def fmtValue(s): |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
53 return re.sub("[ -]", "_", re.sub(",", "", unicode(s))) |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
54 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
55 if style=="category": |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
56 category = categoryCodeDescription(unicodedata.category(metadatum)) |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
57 # return "category:" + fmtValue(category) |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
58 return fmtValue(category) |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
59 elif style=="block": |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
60 # return "block:" + fmtValue(block(metadatum)) |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
61 return fmtValue(block(metadatum)) |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
62 else: |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
63 return None |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
64 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
65 # From http://stackoverflow.com/a/245072 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
66 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
67 _blocks = [] |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
68 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
69 def _initBlocks(text): |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
70 pattern = re.compile(r'([0-9A-F]+)\.\.([0-9A-F]+);\ (\S.*\S)') |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
71 for line in text.splitlines(): |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
72 m = pattern.match(line) |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
73 if m: |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
74 start, end, name = m.groups() |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
75 _blocks.append((int(start, 16), int(end, 16), name)) |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
76 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
77 # retrieved from http://unicode.org/Public/UNIDATA/Blocks.txt |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
78 _initBlocks(''' |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
79 # Blocks-5.1.0.txt |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
80 # Date: 2008-03-20, 17:41:00 PDT [KW] |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
81 # |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
82 # Unicode Character Database |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
83 # Copyright (c) 1991-2008 Unicode, Inc. |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
84 # For terms of use, see http://www.unicode.org/terms_of_use.html |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
85 # For documentation, see UCD.html |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
86 # |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
87 # Note: The casing of block names is not normative. |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
88 # For example, "Basic Latin" and "BASIC LATIN" are equivalent. |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
89 # |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
90 # Format: |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
91 # Start Code..End Code; Block Name |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
92 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
93 # ================================================ |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
94 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
95 # Note: When comparing block names, casing, whitespace, hyphens, |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
96 # and underbars are ignored. |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
97 # For example, "Latin Extended-A" and "latin extended a" are equivalent. |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
98 # For more information on the comparison of property values, |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
99 # see UCD.html. |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
100 # |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
101 # All code points not explicitly listed for Block |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
102 # have the value No_Block. |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
103 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
104 # Property: Block |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
105 # |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
106 # @missing: 0000..10FFFF; No_Block |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
107 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
108 0000..007F; Basic Latin |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
109 0080..00FF; Latin-1 Supplement |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
110 0100..017F; Latin Extended-A |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
111 0180..024F; Latin Extended-B |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
112 0250..02AF; IPA Extensions |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
113 02B0..02FF; Spacing Modifier Letters |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
114 0300..036F; Combining Diacritical Marks |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
115 0370..03FF; Greek and Coptic |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
116 0400..04FF; Cyrillic |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
117 0500..052F; Cyrillic Supplement |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
118 0530..058F; Armenian |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
119 0590..05FF; Hebrew |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
120 0600..06FF; Arabic |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
121 0700..074F; Syriac |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
122 0750..077F; Arabic Supplement |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
123 0780..07BF; Thaana |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
124 07C0..07FF; NKo |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
125 0900..097F; Devanagari |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
126 0980..09FF; Bengali |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
127 0A00..0A7F; Gurmukhi |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
128 0A80..0AFF; Gujarati |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
129 0B00..0B7F; Oriya |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
130 0B80..0BFF; Tamil |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
131 0C00..0C7F; Telugu |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
132 0C80..0CFF; Kannada |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
133 0D00..0D7F; Malayalam |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
134 0D80..0DFF; Sinhala |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
135 0E00..0E7F; Thai |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
136 0E80..0EFF; Lao |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
137 0F00..0FFF; Tibetan |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
138 1000..109F; Myanmar |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
139 10A0..10FF; Georgian |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
140 1100..11FF; Hangul Jamo |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
141 1200..137F; Ethiopic |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
142 1380..139F; Ethiopic Supplement |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
143 13A0..13FF; Cherokee |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
144 1400..167F; Unified Canadian Aboriginal Syllabics |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
145 1680..169F; Ogham |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
146 16A0..16FF; Runic |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
147 1700..171F; Tagalog |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
148 1720..173F; Hanunoo |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
149 1740..175F; Buhid |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
150 1760..177F; Tagbanwa |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
151 1780..17FF; Khmer |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
152 1800..18AF; Mongolian |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
153 1900..194F; Limbu |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
154 1950..197F; Tai Le |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
155 1980..19DF; New Tai Lue |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
156 19E0..19FF; Khmer Symbols |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
157 1A00..1A1F; Buginese |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
158 1B00..1B7F; Balinese |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
159 1B80..1BBF; Sundanese |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
160 1C00..1C4F; Lepcha |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
161 1C50..1C7F; Ol Chiki |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
162 1D00..1D7F; Phonetic Extensions |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
163 1D80..1DBF; Phonetic Extensions Supplement |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
164 1DC0..1DFF; Combining Diacritical Marks Supplement |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
165 1E00..1EFF; Latin Extended Additional |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
166 1F00..1FFF; Greek Extended |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
167 2000..206F; General Punctuation |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
168 2070..209F; Superscripts and Subscripts |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
169 20A0..20CF; Currency Symbols |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
170 20D0..20FF; Combining Diacritical Marks for Symbols |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
171 2100..214F; Letterlike Symbols |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
172 2150..218F; Number Forms |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
173 2190..21FF; Arrows |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
174 2200..22FF; Mathematical Operators |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
175 2300..23FF; Miscellaneous Technical |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
176 2400..243F; Control Pictures |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
177 2440..245F; Optical Character Recognition |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
178 2460..24FF; Enclosed Alphanumerics |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
179 2500..257F; Box Drawing |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
180 2580..259F; Block Elements |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
181 25A0..25FF; Geometric Shapes |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
182 2600..26FF; Miscellaneous Symbols |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
183 2700..27BF; Dingbats |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
184 27C0..27EF; Miscellaneous Mathematical Symbols-A |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
185 27F0..27FF; Supplemental Arrows-A |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
186 2800..28FF; Braille Patterns |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
187 2900..297F; Supplemental Arrows-B |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
188 2980..29FF; Miscellaneous Mathematical Symbols-B |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
189 2A00..2AFF; Supplemental Mathematical Operators |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
190 2B00..2BFF; Miscellaneous Symbols and Arrows |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
191 2C00..2C5F; Glagolitic |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
192 2C60..2C7F; Latin Extended-C |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
193 2C80..2CFF; Coptic |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
194 2D00..2D2F; Georgian Supplement |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
195 2D30..2D7F; Tifinagh |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
196 2D80..2DDF; Ethiopic Extended |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
197 2DE0..2DFF; Cyrillic Extended-A |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
198 2E00..2E7F; Supplemental Punctuation |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
199 2E80..2EFF; CJK Radicals Supplement |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
200 2F00..2FDF; Kangxi Radicals |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
201 2FF0..2FFF; Ideographic Description Characters |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
202 3000..303F; CJK Symbols and Punctuation |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
203 3040..309F; Hiragana |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
204 30A0..30FF; Katakana |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
205 3100..312F; Bopomofo |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
206 3130..318F; Hangul Compatibility Jamo |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
207 3190..319F; Kanbun |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
208 31A0..31BF; Bopomofo Extended |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
209 31C0..31EF; CJK Strokes |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
210 31F0..31FF; Katakana Phonetic Extensions |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
211 3200..32FF; Enclosed CJK Letters and Months |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
212 3300..33FF; CJK Compatibility |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
213 3400..4DBF; CJK Unified Ideographs Extension A |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
214 4DC0..4DFF; Yijing Hexagram Symbols |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
215 4E00..9FFF; CJK Unified Ideographs |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
216 A000..A48F; Yi Syllables |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
217 A490..A4CF; Yi Radicals |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
218 A500..A63F; Vai |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
219 A640..A69F; Cyrillic Extended-B |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
220 A700..A71F; Modifier Tone Letters |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
221 A720..A7FF; Latin Extended-D |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
222 A800..A82F; Syloti Nagri |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
223 A840..A87F; Phags-pa |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
224 A880..A8DF; Saurashtra |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
225 A900..A92F; Kayah Li |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
226 A930..A95F; Rejang |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
227 AA00..AA5F; Cham |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
228 AC00..D7AF; Hangul Syllables |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
229 D800..DB7F; High Surrogates |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
230 DB80..DBFF; High Private Use Surrogates |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
231 DC00..DFFF; Low Surrogates |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
232 E000..F8FF; Private Use Area |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
233 F900..FAFF; CJK Compatibility Ideographs |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
234 FB00..FB4F; Alphabetic Presentation Forms |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
235 FB50..FDFF; Arabic Presentation Forms-A |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
236 FE00..FE0F; Variation Selectors |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
237 FE10..FE1F; Vertical Forms |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
238 FE20..FE2F; Combining Half Marks |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
239 FE30..FE4F; CJK Compatibility Forms |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
240 FE50..FE6F; Small Form Variants |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
241 FE70..FEFF; Arabic Presentation Forms-B |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
242 FF00..FFEF; Halfwidth and Fullwidth Forms |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
243 FFF0..FFFF; Specials |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
244 10000..1007F; Linear B Syllabary |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
245 10080..100FF; Linear B Ideograms |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
246 10100..1013F; Aegean Numbers |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
247 10140..1018F; Ancient Greek Numbers |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
248 10190..101CF; Ancient Symbols |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
249 101D0..101FF; Phaistos Disc |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
250 10280..1029F; Lycian |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
251 102A0..102DF; Carian |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
252 10300..1032F; Old Italic |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
253 10330..1034F; Gothic |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
254 10380..1039F; Ugaritic |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
255 103A0..103DF; Old Persian |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
256 10400..1044F; Deseret |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
257 10450..1047F; Shavian |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
258 10480..104AF; Osmanya |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
259 10800..1083F; Cypriot Syllabary |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
260 10900..1091F; Phoenician |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
261 10920..1093F; Lydian |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
262 10A00..10A5F; Kharoshthi |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
263 12000..123FF; Cuneiform |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
264 12400..1247F; Cuneiform Numbers and Punctuation |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
265 1D000..1D0FF; Byzantine Musical Symbols |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
266 1D100..1D1FF; Musical Symbols |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
267 1D200..1D24F; Ancient Greek Musical Notation |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
268 1D300..1D35F; Tai Xuan Jing Symbols |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
269 1D360..1D37F; Counting Rod Numerals |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
270 1D400..1D7FF; Mathematical Alphanumeric Symbols |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
271 1F000..1F02F; Mahjong Tiles |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
272 1F030..1F09F; Domino Tiles |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
273 20000..2A6DF; CJK Unified Ideographs Extension B |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
274 2F800..2FA1F; CJK Compatibility Ideographs Supplement |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
275 E0000..E007F; Tags |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
276 E0100..E01EF; Variation Selectors Supplement |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
277 F0000..FFFFF; Supplementary Private Use Area-A |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
278 100000..10FFFF; Supplementary Private Use Area-B |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
279 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
280 # EOF |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
281 ''') |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
282 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def block(ch):
    '''
    Look up the name of the Unicode block that contains the character ch.

    The entries of _blocks are examined in order; the name of the first
    entry whose inclusive (start, end) range contains ord(ch) is returned.
    None is returned when no entry matches.

    ch must be a unicode string of length 1 (enforced by an assertion).

    >>> block(u'a')
    'Basic Latin'
    >>> block(unichr(0x0b80))
    'Tamil'
    >>> block(unichr(0xe0080))

    '''
    assert isinstance(ch, unicode) and len(ch) == 1, repr(ch)
    codepoint = ord(ch)
    # Lazily filter the table; next() stops at the first hit and falls
    # back to None when the generator is exhausted.
    matching = (label for low, high, label in _blocks
                if low <= codepoint <= high)
    return next(matching, None)
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
300 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# Descriptions of the Unicode general category codes, keyed by the
# two-letter code.
#
# Three entries deliberately use a shortened wording:
#   'Cn' omits "(no characters in the file have this property)"
#   'Pf' and 'Pi' omit "(may behave like Ps or Pe depending on usage)"
categoryCodeDescriptions = {
    # C*: other
    'Cc': "Other, Control",
    'Cf': "Other, Format",
    'Cn': "Other, Not Assigned",
    'Co': "Other, Private Use",
    'Cs': "Other, Surrogate",
    # L*: letters
    'LC': "Letter, Cased",
    'Ll': "Letter, Lowercase",
    'Lm': "Letter, Modifier",
    'Lo': "Letter, Other",
    'Lt': "Letter, Titlecase",
    'Lu': "Letter, Uppercase",
    # M*: marks
    'Mc': "Mark, Spacing Combining",
    'Me': "Mark, Enclosing",
    'Mn': "Mark, Nonspacing",
    # N*: numbers
    'Nd': "Number, Decimal Digit",
    'Nl': "Number, Letter",
    'No': "Number, Other",
    # P*: punctuation
    'Pc': "Punctuation, Connector",
    'Pd': "Punctuation, Dash",
    'Pe': "Punctuation, Close",
    'Pf': "Punctuation, Final quote",
    'Pi': "Punctuation, Initial quote",
    'Po': "Punctuation, Other",
    'Ps': "Punctuation, Open",
    # S*: symbols
    'Sc': "Symbol, Currency",
    'Sk': "Symbol, Modifier",
    'Sm': "Symbol, Math",
    'So': "Symbol, Other",
    # Z*: separators
    'Zl': "Separator, Line",
    'Zp': "Separator, Paragraph",
    'Zs': "Separator, Space",
}
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
335 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def categoryCodeDescription(category):
    '''
    Return the description stored in categoryCodeDescriptions for the
    given category code, or "Not Available" when the code is not a key
    of that table.
    '''
    try:
        return categoryCodeDescriptions[category]
    except KeyError:
        return "Not Available"
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
338 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def _catalog(histo):
    '''
    Render a histogram as a "bag" string: every key is repeated as many
    times as its count (space-separated), keys are taken in sorted order,
    and the per-key groups are joined with ", ".
    '''
    return ", ".join(" ".join([key] * histo[key]) for key in sorted(histo))


def analyze(part):
    '''
    Annotate part with statistics about the non-ASCII characters of
    part["text"] and return the same (mutated) dict.

    Every character c for which isAscii(c) is false is recorded, together
    with fmtMetadatum(c, 'category') and fmtMetadatum(c, 'block') whenever
    those values are truthy.  Nine entries are then stored in part, three
    for each of codepoints ("unicode..."), categories
    ("unicodeCategory...") and blocks ("unicodeBlock..."):

      ...Histogram  JSON-encoded dict mapping each item to its count
      ...Signature  the items in order of occurrence, space-separated
      ...Catalog    sorted bag of the items, as produced by _catalog

    Raises KeyError if part has no "text" entry.
    '''
    content = part["text"]

    codepointSeq = []
    categorySeq = []
    blockSeq = []
    codepointHisto = Counter()
    categoryHisto = Counter()
    blockHisto = Counter()

    for c in content:
        if isAscii(c):
            continue
        codepointHisto[c] += 1
        codepointSeq.append(c)
        cat = fmtMetadatum(c, 'category')
        blk = fmtMetadatum(c, 'block')
        if cat:
            categoryHisto[cat] += 1
            categorySeq.append(cat)
        if blk:
            blockHisto[blk] += 1
            blockSeq.append(blk)
        # NOTE: NFKD-normalised forms are deliberately not counted; they
        # were presumed to be of minor importance and omitted.

    # Codepoints
    part["unicodeHistogram"] = json.dumps(codepointHisto)
    part["unicodeSignature"] = " ".join(codepointSeq)
    part["unicodeCatalog"] = _catalog(codepointHisto)

    # Categories
    part["unicodeCategoryHistogram"] = json.dumps(categoryHisto)
    part["unicodeCategorySignature"] = " ".join(categorySeq)
    part["unicodeCategoryCatalog"] = _catalog(categoryHisto)

    # Blocks
    part["unicodeBlockHistogram"] = json.dumps(blockHisto)
    part["unicodeBlockSignature"] = " ".join(blockSeq)
    part["unicodeBlockCatalog"] = _catalog(blockHisto)

    return part
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
400 |
53
91d71e9760e8
forgot what this is about
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
48
diff
changeset
|
401 #Test data |
91d71e9760e8
forgot what this is about
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
48
diff
changeset
|
402 # HEART = u'\u2665' |
91d71e9760e8
forgot what this is about
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
48
diff
changeset
|
403 # SMILY = u'\u263a' |
91d71e9760e8
forgot what this is about
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
48
diff
changeset
|
404 # TSU = u'\u30C4' |
91d71e9760e8
forgot what this is about
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
48
diff
changeset
|
405 # LEFT = u'\u27E8' |
91d71e9760e8
forgot what this is about
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
48
diff
changeset
|
406 # RIGHT = u'\u27E9' |
91d71e9760e8
forgot what this is about
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
48
diff
changeset
|
407 # EURO = u'\u20AC' |
48
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
408 |
53
91d71e9760e8
forgot what this is about
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
48
diff
changeset
|
409 # if True: |
48
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
410 |
53
91d71e9760e8
forgot what this is about
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
48
diff
changeset
|
411 # TESTUNICODE = LEFT + "h" + EURO + "llo " + HEART + HEART + SMILY + TSU + " goodby" + EURO + " " + SMILY + TSU + HEART + HEART + HEART + HEART + RIGHT |
48
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
412 |
53
91d71e9760e8
forgot what this is about
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
48
diff
changeset
|
413 # print len(TESTUNICODE) |
91d71e9760e8
forgot what this is about
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
48
diff
changeset
|
414 # print json.dumps(TESTUNICODE) |
48
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
415 |
53
91d71e9760e8
forgot what this is about
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
48
diff
changeset
|
416 # TESTDOC = {"@context": "http://localhost:8080/publish/JSON/WSP1WS6-select unix_timestamp(a_importtime)*1000 as timestamp, a_* from ads a join sample s on a_id=s_id limit 50-context.json","schema:provider": {"a": "Organization", "uri": "http://memex.zapto.org/data/organization/1"}, "snapshotUri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/raw","a": "WebPage","dateCreated": "2013-09-24T18:28:00","hasBodyPart": {"text": TESTUNICODE, "a": "WebPageElement"}, "hasTitlePart": {"text": "\u270b\u270b\u270bOnly Best \u270c\u270c\u270c Forget The Rest \u270b\u270b\u270b Outcall Specials TONIGHT \u270c\ud83d\udc8b\ud83d\udc45 Sexy Blonde is UP LATE \ud83d\udc9c\ud83d\udc9b\u270b\u270c - 25", "a": "WebPageElement"}, "uri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/processed"} |
48
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
417 |
53
91d71e9760e8
forgot what this is about
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
48
diff
changeset
|
418 # analyze(TESTDOC["hasBodyPart"]) |
91d71e9760e8
forgot what this is about
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
48
diff
changeset
|
419 # json.dump(TESTDOC, sys.stdout, indent=4); |
91d71e9760e8
forgot what this is about
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
48
diff
changeset
|
420 # exit(0) |
48
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
421 |
99bfff1538c6
as downloaded, python2.7...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# Filter stdin to stdout.  Each input line is expected to hold a URL and
# a JSON document separated by a tab; the document's "hasBodyPart" and
# "hasTitlePart" entries are analyzed in place and the augmented record
# is written back out in the same "<url>\t<json>" layout.
for line in sys.stdin:
    try:
        # Split on the first tab only: further tabs can legally occur in
        # the JSON payload as inter-token whitespace and must not make
        # the record unparseable.
        url, jrep = line.split('\t', 1)
        d = json.loads(jrep)

        analyze(d["hasBodyPart"])
        analyze(d["hasTitlePart"])
        # Record the generation time under a key specific to this tool
        # so that it does not collide with any other gentime.
        d["unicodeGentime"] = gentime()

        sys.stdout.write(url + "\t")
        json.dump(d, sys.stdout, sort_keys=True)
        sys.stdout.write("\n")
    except ValueError as e:
        # Malformed line (no tab, or invalid JSON): report it on stderr
        # and carry on with the next record.
        sys.stderr.write(str(e) + "\n")