comparison unicode_hist.py @ 54:dd63412fc882
fix something???
author | Henry S. Thompson <ht@inf.ed.ac.uk>
date | Mon, 29 May 2023 22:03:08 +0100
parents | 6faea25a69b3
children |
53:91d71e9760e8 | 54:dd63412fc882 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # -*- coding: utf-8 -*- | 2 # -*- coding: utf-8 -*- |
3 | 3 # from https://github.com/usc-isi-i2/dig-unicode/blob/master/python/unicode-histogram.py |
4 import sys | 4 import sys |
5 import re | 5 import re, pprint |
6 try: | 6 try: |
7 import simplejson as json | 7 import simplejson as json |
8 except: | 8 except: |
9 import json | 9 import json |
10 | 10 |
12 import unicodedata | 12 import unicodedata |
13 | 13 |
14 from time import strftime, gmtime | 14 from time import strftime, gmtime |
15 | 15 |
16 """ | 16 """ |
17 12 December 2014 | |
18 for each of {body, title}: | |
19 the unicodeSignature is the sequence of >ascii codepoints, in order, space-separated | |
20 the unicodeCatalog is the bag of >ascii codepoints, sorted/agglomerated using space, comma-separated | |
21 the unicodeHistogram is a json-encoded python dict/json object mapping codepoint to count | |
22 | |
23 the unicodeBlockSignature is the sequence of block descriptors (of all >ascii), in order, space-separated | |
24 the unicodeBlockCatalog is the bag of block descriptors, sorted/agglomerated using space, comma-separated | |
25 the unicodeBlockHistogram is a json-encoded python dict/json object mapping block descriptor to count | 17 the unicodeBlockHistogram is a json-encoded python dict/json object mapping block descriptor to count |
26 | 18 |
27 the unicodeCategorySignature is the sequence of category descriptors (of all >ascii), in order, space-separated | |
28 the unicodeCategoryCatalog is the bag of category descriptors, sorted/agglomerated using space, comma-separated | |
29 the unicodeCategoryHistogram is a json-encoded python dict/json object mapping category descriptor to count | 19 the unicodeCategoryHistogram is a json-encoded python dict/json object mapping category descriptor to count |
30 | 20 |
31 where block and category descriptors are defined via | 21 where block and category descriptors are defined via |
32 # From http://stackoverflow.com/a/245072 | 22 # From http://stackoverflow.com/a/245072 |
33 # retrieved from http://unicode.org/Public/UNIDATA/Blocks.txt | 23 # retrieved from http://unicode.org/Public/UNIDATA/Blocks.txt |
48 def fmtCodepoint(codepoint, style): | 38 def fmtCodepoint(codepoint, style): |
49 return codepoint | 39 return codepoint |
50 | 40 |
51 def fmtMetadatum(metadatum, style): | 41 def fmtMetadatum(metadatum, style): |
52 def fmtValue(s): | 42 def fmtValue(s): |
53 return re.sub("[ -]", "_", re.sub(",", "", s)) | 43 return re.sub("[ -]", "_", re.sub(",", "", str(s))) |
54 | 44 |
55 if style=="category": | 45 if style=="category": |
56 category = categoryCodeDescription(unicodedata.category(metadatum)) | 46 category = categoryCodeDescription(unicodedata.category(metadatum)) |
57 # return "category:" + fmtValue(category) | 47 # return "category:" + fmtValue(category) |
58 return fmtValue(category) | 48 return fmtValue(category) |
290 'Tamil' | 280 'Tamil' |
291 >>> block(unichr(0xe0080)) | 281 >>> block(unichr(0xe0080)) |
292 | 282 |
293 ''' | 283 ''' |
294 | 284 |
295 #assert isinstance(ch, unicode) and len(ch) == 1, repr(ch) | 285 assert isinstance(ch, str) and len(ch) == 1, repr(ch) |
296 cp = ord(ch) | 286 cp = ord(ch) |
297 for start, end, name in _blocks: | 287 for start, end, name in _blocks: |
298 if start <= cp <= end: | 288 if start <= cp <= end: |
299 return name | 289 return name |
300 | 290 |
334 'Zs': "Separator, Space"} | 324 'Zs': "Separator, Space"} |
335 | 325 |
336 def categoryCodeDescription(category): | 326 def categoryCodeDescription(category): |
337 return categoryCodeDescriptions.get(category, "Not Available") | 327 return categoryCodeDescriptions.get(category, "Not Available") |
338 | 328 |
339 def analyze(part): | 329 codepointHisto = Counter() |
340 content = part["text"] | 330 categoryHisto = Counter() |
341 codepointSeq = [] | 331 blockHisto = Counter() |
342 categorySeq = [] | 332 dd={'codepoints':codepointHisto, |
343 blockSeq = [] | 333 'categories':categoryHisto, |
344 codepointHisto = Counter() | 334 'blocks':blockHisto} |
345 categoryHisto = Counter() | 335 |
346 blockHisto = Counter() | 336 def analyze(content): |
347 for c in content: | 337 for c in content: |
348 if not isAscii(c): | 338 codepointHisto[c] += 1 |
349 codepointHisto[c] += 1 | 339 cat = fmtMetadatum(c, 'category') |
350 codepointSeq.append(c) | 340 blk = fmtMetadatum(c, 'block') |
351 cat = fmtMetadatum(c, 'category') | 341 if cat: |
352 blk = fmtMetadatum(c, 'block') | 342 categoryHisto[cat] += 1 |
353 if cat: | 343 if blk: |
354 categoryHisto[cat] += 1 | 344 blockHisto[blk] += 1 |
355 categorySeq.append(cat) | 345 return (codepointHisto, categoryHisto, blockHisto) |
356 if blk: | |
357 blockHisto[blk] += 1 | |
358 blockSeq.append(blk) | |
359 # Normal form KD | |
360 # presumed of minor importance: omitted for now | |
361 # categoryHisto["normalized:" + unicodedata.normalize(c.decode('utf-8'),'NFKD')] += 1 | |
362 contentElements = codepointSeq | |
363 # Histogram: JSON-encoded string repn of the dict | |
364 part["unicodeHistogram"] = json.dumps(codepointHisto) | |
365 # Signature: sequence of codepoints | |
366 part["unicodeSignature"] = " ".join(codepointSeq) | |
367 # Catalog: bag of codepoints | |
368 codepointCatalogElements = [] | |
369 for k in sorted(codepointHisto.keys()): | |
370 v = codepointHisto[k] | |
371 # v copies of this key | |
372 codepointCatalogElements.append(" ".join([k for _ in range(v)])) | |
373 part["unicodeCatalog"] = ", ".join(codepointCatalogElements) | |
374 | |
375 # Histogram: JSON-encoded string repn of the dict | |
376 part["unicodeCategoryHistogram"] = json.dumps(categoryHisto) | |
377 # Signature: sequence of codepoints | |
378 part["unicodeCategorySignature"] = " ".join(categorySeq) | |
379 # Catalog: bag of categories | |
380 categoryCatalogElements = [] | |
381 for k in sorted(categoryHisto.keys()): | |
382 v = categoryHisto[k] | |
383 # v copies of this key | |
384 categoryCatalogElements.append(" ".join([k for _ in range(v)])) | |
385 part["unicodeCategoryCatalog"] = ", ".join(categoryCatalogElements) | |
386 | |
387 # Histogram: JSON-encoded string repn of the dict | |
388 part["unicodeBlockHistogram"] = json.dumps(blockHisto) | |
389 # Signature: sequence of codepoints | |
390 part["unicodeBlockSignature"] = " ".join(blockSeq) | |
391 # Catalog: bag of blocks | |
392 blockCatalogElements = [] | |
393 for k in sorted(blockHisto.keys()): | |
394 v = blockHisto[k] | |
395 # v copies of this key | |
396 blockCatalogElements.append(" ".join([k for _ in range(v)])) | |
397 part["unicodeBlockCatalog"] = ", ".join(blockCatalogElements) | |
398 | |
399 return part | |
400 | 346 |
401 #Test data | 347 #Test data |
402 HEART = u'\u2665' | 348 # HEART = '\u2665' |
403 SMILY = u'\u263a' | 349 # SMILY = '\u263a' |
404 TSU = u'\u30C4' | 350 # TSU = '\u30C4' |
405 LEFT = u'\u27E8' | 351 # LEFT = '\u27E8' |
406 RIGHT = u'\u27E9' | 352 # RIGHT = '\u27E9' |
407 EURO = u'\u20AC' | 353 # EURO = '\u20AC' |
408 | 354 |
409 if True: | 355 # if True: |
410 | 356 |
411 TESTUNICODE = LEFT + "h" + EURO + "llo " + HEART + HEART + SMILY + TSU + " goodby" + EURO + " " + SMILY + TSU + HEART + HEART + HEART + HEART + RIGHT | 357 # TESTUNICODE = LEFT + "h" + EURO + "llo " + HEART + HEART + SMILY + TSU + " goodby" + EURO + " " + SMILY + TSU + HEART + HEART + HEART + HEART + RIGHT |
412 | 358 |
413 print(len(TESTUNICODE)) | 359 # print(len(TESTUNICODE)) |
414 print(json.dumps(TESTUNICODE)) | 360 # print(json.dumps(TESTUNICODE)) |
415 | 361 |
416 TESTDOC = {"@context": "http://localhost:8080/publish/JSON/WSP1WS6-select unix_timestamp(a_importtime)*1000 as timestamp, a_* from ads a join sample s on a_id=s_id limit 50-context.json","schema:provider": {"a": "Organization", "uri": "http://memex.zapto.org/data/organization/1"}, "snapshotUri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/raw","a": "WebPage","dateCreated": "2013-09-24T18:28:00","hasBodyPart": {"text": TESTUNICODE, "a": "WebPageElement"}, "hasTitlePart": {"text": "\u270b\u270b\u270bOnly Best \u270c\u270c\u270c Forget The Rest \u270b\u270b\u270b Outcall Specials TONIGHT \u270c\ud83d\udc8b\ud83d\udc45 Sexy Blonde is UP LATE \ud83d\udc9c\ud83d\udc9b\u270b\u270c - 25", "a": "WebPageElement"}, "uri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/processed"} | 362 # TESTDOC = {"@context": "http://localhost:8080/publish/JSON/WSP1WS6-select unix_timestamp(a_importtime)*1000 as timestamp, a_* from ads a join sample s on a_id=s_id limit 50-context.json","schema:provider": {"a": "Organization", "uri": "http://memex.zapto.org/data/organization/1"}, "snapshotUri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/raw","a": "WebPage","dateCreated": "2013-09-24T18:28:00","hasBodyPart": {"text": TESTUNICODE, "a": "WebPageElement"}, "hasTitlePart": {"text": "\u270b\u270b\u270bOnly Best \u270c\u270c\u270c Forget The Rest \u270b\u270b\u270b Outcall Specials TONIGHT \u270c\ud83d\udc8b\ud83d\udc45 Sexy Blonde is UP LATE \ud83d\udc9c\ud83d\udc9b\u270b\u270c - 25", "a": "WebPageElement"}, "uri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/processed"} |
417 | 363 |
418 analyze(TESTDOC["hasBodyPart"]) | 364 # res=analyze(TESTDOC["hasBodyPart"]["text"]) |
419 json.dump(TESTDOC, sys.stdout, indent=4); | 365 # print(res) |
420 exit(0) | 366 # exit(0) |
367 | |
421 | 368 |
422 for line in sys.stdin: | 369 for line in sys.stdin: |
423 try: | 370 try: |
424 (url, jrep) = line.split('\t') | 371 analyze(line) |
425 d = json.loads(jrep) | |
426 | |
427 analyze(d["hasBodyPart"]) | |
428 analyze(d["hasTitlePart"]) | |
429 # insert gmtime | |
430 # ensure it doesn't collide with any other gentime | |
431 d["unicodeGentime"] = gentime() | |
432 | |
433 print(url + "\t",end="") | |
434 json.dump(d, sys.stdout, sort_keys=True) | |
435 print() | |
436 except ValueError as e: | 372 except ValueError as e: |
437 print(e,file=sys.stderr) | 373 print(e,file=sys.stderr) |
438 pass | 374 pass |
375 for k in dd.keys(): | |
376 print(k) | |
377 pprint.pprint(sorted(list(dd[k].items()), | |
378 key=lambda e:e[1],reverse=True)) | |
379 | |
380 |
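For orientation, a hedged worked example of the three output shapes the original (left-hand) docstring and analyze() describe. The string and the helper code below are made up for illustration; only the field names (unicodeHistogram, unicodeSignature, unicodeCatalog) come from the docstring.

```python
# Worked illustration of the histogram / signature / catalog shapes that the
# old analyze() attached to a part, for the made-up content "héllo ♥♥".
import json
from collections import Counter

content = "héllo ♥♥"
nonascii = [c for c in content if ord(c) > 127]    # >ascii codepoints only

histo = Counter(nonascii)                          # codepoint -> count
signature = " ".join(nonascii)                     # in order:   'é ♥ ♥'
catalog = ", ".join(" ".join([k] * v)              # sorted bag: 'é, ♥ ♥'
                    for k, v in sorted(histo.items()))

print(json.dumps(histo))   # unicodeHistogram, e.g. {"\u00e9": 1, "\u2665": 2}
print(signature)           # unicodeSignature
print(catalog)             # unicodeCatalog
```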
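And a minimal, self-contained sketch of the flow the new (right-hand) version sets up: module-level Counters, an analyze() that tallies every character, a loop over stdin, and a pprint of each histogram at the end. The two-entry _blocks table and the use of raw unicodedata.category codes are stand-ins for the full Blocks.txt-derived table and the category-description mapping in the real file.

```python
#!/usr/bin/env python
# Self-contained sketch of the revised pipeline: read text from stdin and
# report per-codepoint, per-category and per-block histograms.
import sys
import unicodedata
from collections import Counter
from pprint import pprint

_blocks = [                       # illustrative stand-in for Blocks.txt
    (0x0000, 0x007F, 'Basic Latin'),
    (0x2600, 0x26FF, 'Miscellaneous Symbols'),
]

def block(ch):
    cp = ord(ch)
    for start, end, name in _blocks:
        if start <= cp <= end:
            return name
    return None

codepointHisto = Counter()
categoryHisto = Counter()
blockHisto = Counter()
dd = {'codepoints': codepointHisto,
      'categories': categoryHisto,
      'blocks': blockHisto}

def analyze(content):
    for c in content:
        codepointHisto[c] += 1
        categoryHisto[unicodedata.category(c)] += 1   # raw code, e.g. 'Ll'
        blk = block(c)
        if blk:
            blockHisto[blk] += 1

for line in sys.stdin:
    analyze(line)

for k in dd:
    print(k)
    pprint(sorted(dd[k].items(), key=lambda e: e[1], reverse=True))
```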