changeset 54:dd63412fc882
fix something???
author   | Henry S. Thompson <ht@inf.ed.ac.uk>
date     | Mon, 29 May 2023 22:03:08 +0100
parents  | 91d71e9760e8
children | 68004ce55703 bc1acb1416ab
files    | unicode_hist.py
diffstat | 1 files changed, 42 insertions(+), 100 deletions(-)
--- a/unicode_hist.py	Mon May 29 22:02:52 2023 +0100
+++ b/unicode_hist.py	Mon May 29 22:03:08 2023 +0100
@@ -1,8 +1,8 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-
+# from https://github.com/usc-isi-i2/dig-unicode/blob/master/python/unicode-histogram.py
 import sys
-import re
+import re, pprint
 try:
     import simplejson as json
 except:
@@ -14,18 +14,8 @@
 from time import strftime, gmtime
 
 """
-12 December 2014
-for each of {body, title}:
-  the unicodeSignature is the sequence of >ascii codepoints, in order, space-separated
-  the unicodeCatalog is the bag of >ascii codepoints, sorted/agglomerated using space, comma-separated
-  the unicodeHistogram is a json-encoded python dict/json object mapping codepoint to count
-
-  the unicodeBlockSignature is the sequence of block descriptors (of all >ascii), in order, space-separated
-  the unicodeBlockCatalog is the bag of block descriptors, sorted/agglomerated using space, comma-separated
   the unicodeBlockHistogram is a json-encoded python dict/json object mapping block descriptor to count
 
-  the unicodeCategorySignature is the sequence of category descriptors (of all >ascii), in order, space-separated
-  the unicodeCategoryCatalog is the bag of category descriptors, sorted/agglomerated using space, comma-separated
   the unicodeCategoryHistogram is a json-encoded python dict/json object mapping category descriptor to count
 
 where block and category descriptors are defined via
@@ -50,7 +40,7 @@
 def fmtMetadatum(metadatum, style):
 
     def fmtValue(s):
-        return re.sub("[ -]", "_", re.sub(",", "", s))
+        return re.sub("[ -]", "_", re.sub(",", "", str(s)))
 
     if style=="category":
         category = categoryCodeDescription(unicodedata.category(metadatum))
@@ -292,7 +282,7 @@
 
 
     '''
-    #assert isinstance(ch, unicode) and len(ch) == 1, repr(ch)
+    assert isinstance(ch, str) and len(ch) == 1, repr(ch)
     cp = ord(ch)
     for start, end, name in _blocks:
         if start <= cp <= end:
@@ -336,103 +326,55 @@
 def categoryCodeDescription(category):
     return categoryCodeDescriptions.get(category, "Not Available")
 
-def analyze(part):
-    content = part["text"]
-    codepointSeq = []
-    categorySeq = []
-    blockSeq = []
-    codepointHisto = Counter()
-    categoryHisto = Counter()
-    blockHisto = Counter()
+codepointHisto = Counter()
+categoryHisto = Counter()
+blockHisto = Counter()
+dd={'codepoints':codepointHisto,
+    'categories':categoryHisto,
+    'blocks':blockHisto}
+
+def analyze(content):
     for c in content:
-        if not isAscii(c):
-            codepointHisto[c] += 1
-            codepointSeq.append(c)
-            cat = fmtMetadatum(c, 'category')
-            blk = fmtMetadatum(c, 'block')
-            if cat:
-                categoryHisto[cat] += 1
-                categorySeq.append(cat)
-            if blk:
-                blockHisto[blk] += 1
-                blockSeq.append(blk)
-    # Normal form KD
-    # presumed of minor importance: omitted for now
-    # categoryHisto["normalized:" + unicodedata.normalize(c.decode('utf-8'),'NFKD')] += 1
-    contentElements = codepointSeq
-    # Histogram: JSON-encoded string repn of the dict
-    part["unicodeHistogram"] = json.dumps(codepointHisto)
-    # Signature: sequence of codepoints
-    part["unicodeSignature"] = " ".join(codepointSeq)
-    # Catalog: bag of codepoints
-    codepointCatalogElements = []
-    for k in sorted(codepointHisto.keys()):
-        v = codepointHisto[k]
-        # v copies of this key
-        codepointCatalogElements.append(" ".join([k for _ in range(v)]))
-    part["unicodeCatalog"] = ", ".join(codepointCatalogElements)
-
-    # Histogram: JSON-encoded string repn of the dict
-    part["unicodeCategoryHistogram"] = json.dumps(categoryHisto)
-    # Signature: sequence of codepoints
-    part["unicodeCategorySignature"] = " ".join(categorySeq)
-    # Catalog: bag of categories
-    categoryCatalogElements = []
-    for k in sorted(categoryHisto.keys()):
-        v = categoryHisto[k]
-        # v copies of this key
-        categoryCatalogElements.append(" ".join([k for _ in range(v)]))
-    part["unicodeCategoryCatalog"] = ", ".join(categoryCatalogElements)
-
-    # Histogram: JSON-encoded string repn of the dict
-    part["unicodeBlockHistogram"] = json.dumps(blockHisto)
-    # Signature: sequence of codepoints
-    part["unicodeBlockSignature"] = " ".join(blockSeq)
-    # Catalog: bag of blocks
-    blockCatalogElements = []
-    for k in sorted(blockHisto.keys()):
-        v = blockHisto[k]
-        # v copies of this key
-        blockCatalogElements.append(" ".join([k for _ in range(v)]))
-    part["unicodeBlockCatalog"] = ", ".join(blockCatalogElements)
-
-    return part
+        codepointHisto[c] += 1
+        cat = fmtMetadatum(c, 'category')
+        blk = fmtMetadatum(c, 'block')
+        if cat:
+            categoryHisto[cat] += 1
+        if blk:
+            blockHisto[blk] += 1
+    return (codepointHisto, categoryHisto, blockHisto)
 
 #Test data
-HEART = u'\u2665'
-SMILY = u'\u263a'
-TSU = u'\u30C4'
-LEFT = u'\u27E8'
-RIGHT = u'\u27E9'
-EURO = u'\u20AC'
+# HEART = '\u2665'
+# SMILY = '\u263a'
+# TSU = '\u30C4'
+# LEFT = '\u27E8'
+# RIGHT = '\u27E9'
+# EURO = '\u20AC'
 
-if True:
+# if True:
 
-    TESTUNICODE = LEFT + "h" + EURO + "llo " + HEART + HEART + SMILY + TSU + " goodby" + EURO + " " + SMILY + TSU + HEART + HEART + HEART + HEART + RIGHT
+# TESTUNICODE = LEFT + "h" + EURO + "llo " + HEART + HEART + SMILY + TSU + " goodby" + EURO + " " + SMILY + TSU + HEART + HEART + HEART + HEART + RIGHT
 
-    print(len(TESTUNICODE))
-    print(json.dumps(TESTUNICODE))
+# print(len(TESTUNICODE))
+# print(json.dumps(TESTUNICODE))
 
-    TESTDOC = {"@context": "http://localhost:8080/publish/JSON/WSP1WS6-select unix_timestamp(a_importtime)*1000 as timestamp, a_* from ads a join sample s on a_id=s_id limit 50-context.json","schema:provider": {"a": "Organization", "uri": "http://memex.zapto.org/data/organization/1"}, "snapshotUri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/raw","a": "WebPage","dateCreated": "2013-09-24T18:28:00","hasBodyPart": {"text": TESTUNICODE, "a": "WebPageElement"}, "hasTitlePart": {"text": "\u270b\u270b\u270bOnly Best \u270c\u270c\u270c Forget The Rest \u270b\u270b\u270b Outcall Specials TONIGHT \u270c\ud83d\udc8b\ud83d\udc45 Sexy Blonde is UP LATE \ud83d\udc9c\ud83d\udc9b\u270b\u270c - 25", "a": "WebPageElement"}, "uri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/processed"}
+# TESTDOC = {"@context": "http://localhost:8080/publish/JSON/WSP1WS6-select unix_timestamp(a_importtime)*1000 as timestamp, a_* from ads a join sample s on a_id=s_id limit 50-context.json","schema:provider": {"a": "Organization", "uri": "http://memex.zapto.org/data/organization/1"}, "snapshotUri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/raw","a": "WebPage","dateCreated": "2013-09-24T18:28:00","hasBodyPart": {"text": TESTUNICODE, "a": "WebPageElement"}, "hasTitlePart": {"text": "\u270b\u270b\u270bOnly Best \u270c\u270c\u270c Forget The Rest \u270b\u270b\u270b Outcall Specials TONIGHT \u270c\ud83d\udc8b\ud83d\udc45 Sexy Blonde is UP LATE \ud83d\udc9c\ud83d\udc9b\u270b\u270c - 25", "a": "WebPageElement"}, "uri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/processed"}
 
-    analyze(TESTDOC["hasBodyPart"])
-    json.dump(TESTDOC, sys.stdout, indent=4);
-    exit(0)
+# res=analyze(TESTDOC["hasBodyPart"]["text"])
+# print(res)
+# exit(0)
+
 
 for line in sys.stdin:
     try:
-        (url, jrep) = line.split('\t')
-        d = json.loads(jrep)
-
-        analyze(d["hasBodyPart"])
-        analyze(d["hasTitlePart"])
-        # insert gmtime
-        # ensure it doesn't collide with any other gentime
-        d["unicodeGentime"] = gentime()
-
-        print(url + "\t",end="")
-        json.dump(d, sys.stdout, sort_keys=True)
-        print()
+        analyze(line)
     except ValueError as e:
         print(e,file=sys.stderr)
        pass
+for k in dd.keys():
+    print(k)
+    pprint.pprint(sorted(list(dd[k].items()),
+                         key=lambda e:e[1],reverse=True))
+
+
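For orientation, the net effect of this changeset is to turn the script from a per-JSON-document annotator into a plain stdin filter: module-level Counters accumulate per-codepoint, per-category and per-block tallies over whatever text is piped in, and each histogram is pretty-printed sorted by descending count. The following is a minimal standalone sketch of that flow, not the committed code: it substitutes unicodedata.category() for the script's own fmtMetadatum()/_blocks machinery and omits the block histogram.

```python
# Sketch only: mirrors the new stdin-driven Counter/pprint flow of
# unicode_hist.py, but uses unicodedata.category() directly instead of
# the script's fmtMetadatum() formatting and its _blocks table.
import sys, pprint, unicodedata
from collections import Counter

codepointHisto = Counter()
categoryHisto = Counter()
dd = {'codepoints': codepointHisto, 'categories': categoryHisto}

def analyze(content):
    # Tally every character and its Unicode general category.
    for c in content:
        codepointHisto[c] += 1
        categoryHisto[unicodedata.category(c)] += 1

for line in sys.stdin:
    analyze(line)

for k in dd:
    print(k)
    # Most frequent entries first, as in the patched script.
    pprint.pprint(sorted(dd[k].items(), key=lambda e: e[1], reverse=True))
```

As with the revised unicode_hist.py itself, such a filter is run over raw text rather than tab-separated URL/JSON records, e.g. `python unicode_hist.py < some_text_file`.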