Mercurial > hg > python
annotate unicode_hist.py @ 69:157f012ffab7 default tip
from local
author | Henry S Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 17 Jan 2025 15:45:26 +0000 |
parents | dd63412fc882 |
children |
rev | line source |
---|---|
47
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/env python |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 # -*- coding: utf-8 -*- |
54 | 3 # from https://github.com/usc-isi-i2/dig-unicode/blob/master/python/unicode-histogram.py |
47
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 import sys |
54 | 5 import re, pprint |
47
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 try: |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 import simplejson as json |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 except: |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
9 import json |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
10 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 from collections import Counter |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
12 import unicodedata |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
13 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
14 from time import strftime, gmtime |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
15 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
16 """ |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
17 the unicodeBlockHistogram is a json-encoded python dict/json object mapping block descriptor to count |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
18 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
19 the unicodeCategoryHistogram is a json-encoded python dict/json object mapping category descriptor to count |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
20 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
21 where block and category descriptors are defined via |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
22 # From http://stackoverflow.com/a/245072 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
23 # retrieved from http://unicode.org/Public/UNIDATA/Blocks.txt |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
24 # Blocks-5.1.0.txt |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
25 # Date: 2008-03-20, 17:41:00 PDT [KW] |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
26 and are formatted using _ rather than ,/space/- |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
27 """ |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
28 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def isAscii(c):
    """Return True if ``ord(c)`` is at most 127, i.e. `c` is an ASCII character.

    Any value that ``ord`` rejects (a non-character object, or a string
    whose length is not exactly 1) is reported as not ASCII instead of
    raising.
    """
    try:
        return ord(c) <= 127
    except TypeError:
        # ord() signals bad input with TypeError; catching only that lets
        # KeyboardInterrupt/SystemExit and genuine bugs propagate.
        return False
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
34 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def gentime():
    """Return the current UTC time as a 'YYYY-MM-DD HH:MM:SS' string."""
    utc_now = gmtime()
    return strftime("%Y-%m-%d %H:%M:%S", utc_now)
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
37 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def fmtCodepoint(codepoint, style):
    """Return `codepoint` unchanged; `style` is accepted but not used."""
    formatted = codepoint
    return formatted
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
40 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def fmtMetadatum(metadatum, style):
    """Return the formatted descriptor of `metadatum` for the given `style`.

    For style "category" the result of ``unicodedata.category(metadatum)``
    is passed through ``categoryCodeDescription``; for style "block" the
    value is passed through ``block``.  In both cases the result is
    formatted by removing commas and turning spaces and hyphens into
    underscores.  The bare descriptor is returned, without any
    "category:" / "block:" prefix.  Any other style yields None.
    """
    def fmtValue(s):
        # Drop commas first, then map spaces and hyphens to underscores.
        without_commas = re.sub(",", "", str(s))
        return re.sub("[ -]", "_", without_commas)

    if style == "category":
        code = unicodedata.category(metadatum)
        return fmtValue(categoryCodeDescription(code))
    if style == "block":
        return fmtValue(block(metadatum))
    return None
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
54 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
55 # From http://stackoverflow.com/a/245072 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
56 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# Accumulates (start, end, name) tuples appended by _initBlocks.
_blocks = []


def _initBlocks(text):
    """Parse Blocks.txt-style `text` and append its ranges to `_blocks`.

    Every line that begins with ``START..END; Name`` (START and END written
    in uppercase hexadecimal) contributes one ``(start, end, name)`` tuple
    whose bounds are converted to integers.  Lines that do not match, such
    as comments and blank lines, are skipped.
    """
    range_line = re.compile(r'([0-9A-F]+)\.\.([0-9A-F]+);\ (\S.*\S)')
    for row in text.splitlines():
        found = range_line.match(row)
        if not found:
            continue
        first, last, label = found.groups()
        _blocks.append((int(first, 16), int(last, 16), label))
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
66 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
67 # retrieved from http://unicode.org/Public/UNIDATA/Blocks.txt |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
68 _initBlocks(''' |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
69 # Blocks-5.1.0.txt |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
70 # Date: 2008-03-20, 17:41:00 PDT [KW] |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
71 # |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
72 # Unicode Character Database |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
73 # Copyright (c) 1991-2008 Unicode, Inc. |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
74 # For terms of use, see http://www.unicode.org/terms_of_use.html |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
75 # For documentation, see UCD.html |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
76 # |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
77 # Note: The casing of block names is not normative. |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
78 # For example, "Basic Latin" and "BASIC LATIN" are equivalent. |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
79 # |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
80 # Format: |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
81 # Start Code..End Code; Block Name |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
82 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
83 # ================================================ |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
84 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
85 # Note: When comparing block names, casing, whitespace, hyphens, |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
86 # and underbars are ignored. |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
87 # For example, "Latin Extended-A" and "latin extended a" are equivalent. |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
88 # For more information on the comparison of property values, |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
89 # see UCD.html. |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
90 # |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
91 # All code points not explicitly listed for Block |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
92 # have the value No_Block. |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
93 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
94 # Property: Block |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
95 # |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
96 # @missing: 0000..10FFFF; No_Block |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
97 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
98 0000..007F; Basic Latin |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
99 0080..00FF; Latin-1 Supplement |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
100 0100..017F; Latin Extended-A |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
101 0180..024F; Latin Extended-B |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
102 0250..02AF; IPA Extensions |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
103 02B0..02FF; Spacing Modifier Letters |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
104 0300..036F; Combining Diacritical Marks |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
105 0370..03FF; Greek and Coptic |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
106 0400..04FF; Cyrillic |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
107 0500..052F; Cyrillic Supplement |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
108 0530..058F; Armenian |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
109 0590..05FF; Hebrew |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
110 0600..06FF; Arabic |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
111 0700..074F; Syriac |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
112 0750..077F; Arabic Supplement |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
113 0780..07BF; Thaana |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
114 07C0..07FF; NKo |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
115 0900..097F; Devanagari |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
116 0980..09FF; Bengali |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
117 0A00..0A7F; Gurmukhi |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
118 0A80..0AFF; Gujarati |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
119 0B00..0B7F; Oriya |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
120 0B80..0BFF; Tamil |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
121 0C00..0C7F; Telugu |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
122 0C80..0CFF; Kannada |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
123 0D00..0D7F; Malayalam |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
124 0D80..0DFF; Sinhala |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
125 0E00..0E7F; Thai |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
126 0E80..0EFF; Lao |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
127 0F00..0FFF; Tibetan |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
128 1000..109F; Myanmar |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
129 10A0..10FF; Georgian |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
130 1100..11FF; Hangul Jamo |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
131 1200..137F; Ethiopic |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
132 1380..139F; Ethiopic Supplement |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
133 13A0..13FF; Cherokee |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
134 1400..167F; Unified Canadian Aboriginal Syllabics |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
135 1680..169F; Ogham |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
136 16A0..16FF; Runic |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
137 1700..171F; Tagalog |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
138 1720..173F; Hanunoo |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
139 1740..175F; Buhid |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
140 1760..177F; Tagbanwa |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
141 1780..17FF; Khmer |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
142 1800..18AF; Mongolian |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
143 1900..194F; Limbu |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
144 1950..197F; Tai Le |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
145 1980..19DF; New Tai Lue |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
146 19E0..19FF; Khmer Symbols |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
147 1A00..1A1F; Buginese |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
148 1B00..1B7F; Balinese |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
149 1B80..1BBF; Sundanese |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
150 1C00..1C4F; Lepcha |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
151 1C50..1C7F; Ol Chiki |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
152 1D00..1D7F; Phonetic Extensions |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
153 1D80..1DBF; Phonetic Extensions Supplement |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
154 1DC0..1DFF; Combining Diacritical Marks Supplement |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
155 1E00..1EFF; Latin Extended Additional |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
156 1F00..1FFF; Greek Extended |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
157 2000..206F; General Punctuation |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
158 2070..209F; Superscripts and Subscripts |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
159 20A0..20CF; Currency Symbols |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
160 20D0..20FF; Combining Diacritical Marks for Symbols |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
161 2100..214F; Letterlike Symbols |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
162 2150..218F; Number Forms |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
163 2190..21FF; Arrows |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
164 2200..22FF; Mathematical Operators |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
165 2300..23FF; Miscellaneous Technical |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
166 2400..243F; Control Pictures |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
167 2440..245F; Optical Character Recognition |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
168 2460..24FF; Enclosed Alphanumerics |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
169 2500..257F; Box Drawing |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
170 2580..259F; Block Elements |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
171 25A0..25FF; Geometric Shapes |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
172 2600..26FF; Miscellaneous Symbols |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
173 2700..27BF; Dingbats |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
174 27C0..27EF; Miscellaneous Mathematical Symbols-A |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
175 27F0..27FF; Supplemental Arrows-A |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
176 2800..28FF; Braille Patterns |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
177 2900..297F; Supplemental Arrows-B |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
178 2980..29FF; Miscellaneous Mathematical Symbols-B |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
179 2A00..2AFF; Supplemental Mathematical Operators |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
180 2B00..2BFF; Miscellaneous Symbols and Arrows |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
181 2C00..2C5F; Glagolitic |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
182 2C60..2C7F; Latin Extended-C |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
183 2C80..2CFF; Coptic |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
184 2D00..2D2F; Georgian Supplement |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
185 2D30..2D7F; Tifinagh |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
186 2D80..2DDF; Ethiopic Extended |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
187 2DE0..2DFF; Cyrillic Extended-A |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
188 2E00..2E7F; Supplemental Punctuation |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
189 2E80..2EFF; CJK Radicals Supplement |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
190 2F00..2FDF; Kangxi Radicals |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
191 2FF0..2FFF; Ideographic Description Characters |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
192 3000..303F; CJK Symbols and Punctuation |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
193 3040..309F; Hiragana |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
194 30A0..30FF; Katakana |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
195 3100..312F; Bopomofo |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
196 3130..318F; Hangul Compatibility Jamo |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
197 3190..319F; Kanbun |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
198 31A0..31BF; Bopomofo Extended |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
199 31C0..31EF; CJK Strokes |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
200 31F0..31FF; Katakana Phonetic Extensions |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
201 3200..32FF; Enclosed CJK Letters and Months |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
202 3300..33FF; CJK Compatibility |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
203 3400..4DBF; CJK Unified Ideographs Extension A |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
204 4DC0..4DFF; Yijing Hexagram Symbols |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
205 4E00..9FFF; CJK Unified Ideographs |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
206 A000..A48F; Yi Syllables |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
207 A490..A4CF; Yi Radicals |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
208 A500..A63F; Vai |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
209 A640..A69F; Cyrillic Extended-B |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
210 A700..A71F; Modifier Tone Letters |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
211 A720..A7FF; Latin Extended-D |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
212 A800..A82F; Syloti Nagri |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
213 A840..A87F; Phags-pa |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
214 A880..A8DF; Saurashtra |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
215 A900..A92F; Kayah Li |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
216 A930..A95F; Rejang |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
217 AA00..AA5F; Cham |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
218 AC00..D7AF; Hangul Syllables |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
219 D800..DB7F; High Surrogates |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
220 DB80..DBFF; High Private Use Surrogates |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
221 DC00..DFFF; Low Surrogates |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
222 E000..F8FF; Private Use Area |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
223 F900..FAFF; CJK Compatibility Ideographs |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
224 FB00..FB4F; Alphabetic Presentation Forms |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
225 FB50..FDFF; Arabic Presentation Forms-A |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
226 FE00..FE0F; Variation Selectors |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
227 FE10..FE1F; Vertical Forms |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
228 FE20..FE2F; Combining Half Marks |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
229 FE30..FE4F; CJK Compatibility Forms |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
230 FE50..FE6F; Small Form Variants |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
231 FE70..FEFF; Arabic Presentation Forms-B |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
232 FF00..FFEF; Halfwidth and Fullwidth Forms |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
233 FFF0..FFFF; Specials |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
234 10000..1007F; Linear B Syllabary |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
235 10080..100FF; Linear B Ideograms |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
236 10100..1013F; Aegean Numbers |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
237 10140..1018F; Ancient Greek Numbers |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
238 10190..101CF; Ancient Symbols |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
239 101D0..101FF; Phaistos Disc |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
240 10280..1029F; Lycian |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
241 102A0..102DF; Carian |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
242 10300..1032F; Old Italic |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
243 10330..1034F; Gothic |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
244 10380..1039F; Ugaritic |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
245 103A0..103DF; Old Persian |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
246 10400..1044F; Deseret |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
247 10450..1047F; Shavian |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
248 10480..104AF; Osmanya |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
249 10800..1083F; Cypriot Syllabary |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
250 10900..1091F; Phoenician |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
251 10920..1093F; Lydian |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
252 10A00..10A5F; Kharoshthi |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
253 12000..123FF; Cuneiform |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
254 12400..1247F; Cuneiform Numbers and Punctuation |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
255 1D000..1D0FF; Byzantine Musical Symbols |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
256 1D100..1D1FF; Musical Symbols |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
257 1D200..1D24F; Ancient Greek Musical Notation |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
258 1D300..1D35F; Tai Xuan Jing Symbols |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
259 1D360..1D37F; Counting Rod Numerals |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
260 1D400..1D7FF; Mathematical Alphanumeric Symbols |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
261 1F000..1F02F; Mahjong Tiles |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
262 1F030..1F09F; Domino Tiles |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
263 20000..2A6DF; CJK Unified Ideographs Extension B |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
264 2F800..2FA1F; CJK Compatibility Ideographs Supplement |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
265 E0000..E007F; Tags |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
266 E0100..E01EF; Variation Selectors Supplement |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
267 F0000..FFFFF; Supplementary Private Use Area-A |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
268 100000..10FFFF; Supplementary Private Use Area-B |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
269 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
270 # EOF |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
271 ''') |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
272 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def block(ch):
    '''
    Return the Unicode block name for ch, or None if ch has no block.

    ch must be a single-character str.  The block is found by a linear
    scan of the _blocks table, whose entries unpack as (start, end, name)
    with both code point bounds inclusive.

    >>> block('a')
    'Basic Latin'
    >>> block(chr(0x0b80))
    'Tamil'
    >>> block(chr(0xe0080)) is None
    True
    '''

    # Deliberately still an assert, so callers keep seeing AssertionError
    # for a non-str or multi-character argument.
    assert isinstance(ch, str) and len(ch) == 1, repr(ch)
    cp = ord(ch)
    for start, end, name in _blocks:
        if start <= cp <= end:
            return name
    # No range in the table covers this code point.
    return None
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
290 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# Two-letter Unicode general-category codes mapped to human-readable
# descriptions, grouped here by the first letter of the code.
categoryCodeDescriptions = {
    # C*: Other
    'Cc': "Other, Control",
    'Cf': "Other, Format",
    # An earlier, longer wording for 'Cn' was
    # "Other, Not Assigned (no characters in the file have this property)".
    'Cn': "Other, Not Assigned",
    'Co': "Other, Private Use",
    'Cs': "Other, Surrogate",

    # L*: Letter
    'LC': "Letter, Cased",
    'Ll': "Letter, Lowercase",
    'Lm': "Letter, Modifier",
    'Lo': "Letter, Other",
    'Lt': "Letter, Titlecase",
    'Lu': "Letter, Uppercase",

    # M*: Mark
    'Mc': "Mark, Spacing Combining",
    'Me': "Mark, Enclosing",
    'Mn': "Mark, Nonspacing",

    # N*: Number
    'Nd': "Number, Decimal Digit",
    'Nl': "Number, Letter",
    'No': "Number, Other",

    # P*: Punctuation
    'Pc': "Punctuation, Connector",
    'Pd': "Punctuation, Dash",
    'Pe': "Punctuation, Close",
    # Earlier, longer wordings for 'Pf' and 'Pi' added
    # "(may behave like Ps or Pe depending on usage)".
    'Pf': "Punctuation, Final quote",
    'Pi': "Punctuation, Initial quote",
    'Po': "Punctuation, Other",
    'Ps': "Punctuation, Open",

    # S*: Symbol
    'Sc': "Symbol, Currency",
    'Sk': "Symbol, Modifier",
    'Sm': "Symbol, Math",
    'So': "Symbol, Other",

    # Z*: Separator
    'Zl': "Separator, Line",
    'Zp': "Separator, Paragraph",
    'Zs': "Separator, Space",
}
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
325 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def categoryCodeDescription(category):
    '''
    Return the description registered for a general-category code in
    categoryCodeDescriptions, or "Not Available" for an unknown code.
    '''
    try:
        return categoryCodeDescriptions[category]
    except KeyError:
        return "Not Available"
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
328 |
# Module-wide tallies that analyze() adds to: one Counter each for
# individual characters, category labels and block labels.
codepointHisto, categoryHisto, blockHisto = Counter(), Counter(), Counter()

# Report title -> histogram, in the order the report prints them.
dd = dict(codepoints=codepointHisto,
          categories=categoryHisto,
          blocks=blockHisto)

335 | |
def analyze(content):
    '''
    Add every character of content to the module-level histograms.

    Each character is counted in codepointHisto.  Its 'category' and
    'block' labels are obtained from fmtMetadatum, and each label that is
    truthy is counted in categoryHisto / blockHisto respectively.

    Returns the tuple (codepointHisto, categoryHisto, blockHisto); these
    are the shared Counters themselves, not copies.
    '''
    for ch in content:
        codepointHisto[ch] += 1
        # Look up both labels before touching either label histogram.
        cat_label, blk_label = (fmtMetadatum(ch, kind)
                                for kind in ('category', 'block'))
        for histo, label in ((categoryHisto, cat_label),
                             (blockHisto, blk_label)):
            if label:
                histo[label] += 1
    return (codepointHisto, categoryHisto, blockHisto)
47
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
346 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
347 #Test data |
54 | 348 # HEART = '\u2665' |
349 # SMILY = '\u263a' | |
350 # TSU = '\u30C4' | |
351 # LEFT = '\u27E8' | |
352 # RIGHT = '\u27E9' | |
353 # EURO = '\u20AC' | |
47
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
354 |
54 | 355 # if True: |
47
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
356 |
54 | 357 # TESTUNICODE = LEFT + "h" + EURO + "llo " + HEART + HEART + SMILY + TSU + " goodby" + EURO + " " + SMILY + TSU + HEART + HEART + HEART + HEART + RIGHT |
47
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
358 |
54 | 359 # print(len(TESTUNICODE)) |
360 # print(json.dumps(TESTUNICODE)) | |
47
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
361 |
54 | 362 # TESTDOC = {"@context": "http://localhost:8080/publish/JSON/WSP1WS6-select unix_timestamp(a_importtime)*1000 as timestamp, a_* from ads a join sample s on a_id=s_id limit 50-context.json","schema:provider": {"a": "Organization", "uri": "http://memex.zapto.org/data/organization/1"}, "snapshotUri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/raw","a": "WebPage","dateCreated": "2013-09-24T18:28:00","hasBodyPart": {"text": TESTUNICODE, "a": "WebPageElement"}, "hasTitlePart": {"text": "\u270b\u270b\u270bOnly Best \u270c\u270c\u270c Forget The Rest \u270b\u270b\u270b Outcall Specials TONIGHT \u270c\ud83d\udc8b\ud83d\udc45 Sexy Blonde is UP LATE \ud83d\udc9c\ud83d\udc9b\u270b\u270c - 25", "a": "WebPageElement"}, "uri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/processed"} |
47
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
363 |
54 | 364 # res=analyze(TESTDOC["hasBodyPart"]["text"]) |
365 # print(res) | |
366 # exit(0) | |
367 | |
47
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
368 |
6faea25a69b3
brute-force python3 coversion of unicode-histogram.py, q.v.
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# Feed standard input to analyze() one line at a time.
for line in sys.stdin:
    try:
        analyze(line)
    except ValueError as e:
        # Report the problem on stderr and carry on with the next line.
        print(e,file=sys.stderr)

# Print each histogram under its title, highest count first.
for title, histo in dd.items():
    print(title)
    pprint.pprint(histo.most_common())
379 | |
380 |