comparison unicode_hist.py @ 54:dd63412fc882
fix something???
author | Henry S. Thompson <ht@inf.ed.ac.uk>
date | Mon, 29 May 2023 22:03:08 +0100
parents | 6faea25a69b3
children |
53:91d71e9760e8 | 54:dd63412fc882 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # -*- coding: utf-8 -*- | 2 # -*- coding: utf-8 -*- |
3 | 3 # from https://github.com/usc-isi-i2/dig-unicode/blob/master/python/unicode-histogram.py |
4 import sys | 4 import sys |
5 import re | 5 import re, pprint |
6 try: | 6 try: |
7 import simplejson as json | 7 import simplejson as json |
8 except: | 8 except: |
9 import json | 9 import json |
10 | 10 |
12 import unicodedata | 12 import unicodedata |
13 | 13 |
14 from time import strftime, gmtime | 14 from time import strftime, gmtime |
15 | 15 |
16 """ | 16 """ |
17 12 December 2014 | |
18 for each of {body, title}: | |
19 the unicodeSignature is the sequence of >ascii codepoints, in order, space-separated | |
20 the unicodeCatalog is the bag of >ascii codepoints, sorted/agglomerated using space, comma-separated | |
21 the unicodeHistogram is a json-encoded python dict/json object mapping codepoint to count | |
22 | |
23 the unicodeBlockSignature is the sequence of block descriptors (of all >ascii), in order, space-separated | |
24 the unicodeBlockCatalog is the bag of block descriptors, sorted/agglomerated using space, comma-separated | |
25 the unicodeBlockHistogram is a json-encoded python dict/json object mapping block descriptor to count | 17 the unicodeBlockHistogram is a json-encoded python dict/json object mapping block descriptor to count |
26 | 18 |
27 the unicodeCategorySignature is the sequence of category descriptors (of all >ascii), in order, space-separated | |
28 the unicodeCategoryCatalog is the bag of category descriptors, sorted/agglomerated using space, comma-separated | |
29 the unicodeCategoryHistogram is a json-encoded python dict/json object mapping category descriptor to count | 19 the unicodeCategoryHistogram is a json-encoded python dict/json object mapping category descriptor to count |
30 | 20 |
31 where block and category descriptors are defined via | 21 where block and category descriptors are defined via |
32 # From http://stackoverflow.com/a/245072 | 22 # From http://stackoverflow.com/a/245072 |
33 # retrieved from http://unicode.org/Public/UNIDATA/Blocks.txt | 23 # retrieved from http://unicode.org/Public/UNIDATA/Blocks.txt |
48 def fmtCodepoint(codepoint, style): | 38 def fmtCodepoint(codepoint, style): |
49 return codepoint | 39 return codepoint |
50 | 40 |
51 def fmtMetadatum(metadatum, style): | 41 def fmtMetadatum(metadatum, style): |
52 def fmtValue(s): | 42 def fmtValue(s): |
53 return re.sub("[ -]", "_", re.sub(",", "", s)) | 43 return re.sub("[ -]", "_", re.sub(",", "", str(s))) |
54 | 44 |
55 if style=="category": | 45 if style=="category": |
56 category = categoryCodeDescription(unicodedata.category(metadatum)) | 46 category = categoryCodeDescription(unicodedata.category(metadatum)) |
57 # return "category:" + fmtValue(category) | 47 # return "category:" + fmtValue(category) |
58 return fmtValue(category) | 48 return fmtValue(category) |
290 'Tamil' | 280 'Tamil' |
291 >>> block(unichr(0xe0080)) | 281 >>> block(unichr(0xe0080)) |
292 | 282 |
293 ''' | 283 ''' |
294 | 284 |
295 #assert isinstance(ch, unicode) and len(ch) == 1, repr(ch) | 285 assert isinstance(ch, str) and len(ch) == 1, repr(ch) |
296 cp = ord(ch) | 286 cp = ord(ch) |
297 for start, end, name in _blocks: | 287 for start, end, name in _blocks: |
298 if start <= cp <= end: | 288 if start <= cp <= end: |
299 return name | 289 return name |
300 | 290 |
334 'Zs': "Separator, Space"} | 324 'Zs': "Separator, Space"} |
335 | 325 |
336 def categoryCodeDescription(category): | 326 def categoryCodeDescription(category): |
337 return categoryCodeDescriptions.get(category, "Not Available") | 327 return categoryCodeDescriptions.get(category, "Not Available") |
338 | 328 |
339 def analyze(part): | 329 codepointHisto = Counter() |
340 content = part["text"] | 330 categoryHisto = Counter() |
341 codepointSeq = [] | 331 blockHisto = Counter() |
342 categorySeq = [] | 332 dd={'codepoints':codepointHisto, |
343 blockSeq = [] | 333 'categories':categoryHisto, |
344 codepointHisto = Counter() | 334 'blocks':blockHisto} |
345 categoryHisto = Counter() | 335 |
346 blockHisto = Counter() | 336 def analyze(content): |
347 for c in content: | 337 for c in content: |
348 if not isAscii(c): | 338 codepointHisto[c] += 1 |
349 codepointHisto[c] += 1 | 339 cat = fmtMetadatum(c, 'category') |
350 codepointSeq.append(c) | 340 blk = fmtMetadatum(c, 'block') |
351 cat = fmtMetadatum(c, 'category') | 341 if cat: |
352 blk = fmtMetadatum(c, 'block') | 342 categoryHisto[cat] += 1 |
353 if cat: | 343 if blk: |
354 categoryHisto[cat] += 1 | 344 blockHisto[blk] += 1 |
355 categorySeq.append(cat) | 345 return (codepointHisto, categoryHisto, blockHisto) |
356 if blk: | |
357 blockHisto[blk] += 1 | |
358 blockSeq.append(blk) | |
359 # Normal form KD | |
360 # presumed of minor importance: omitted for now | |
361 # categoryHisto["normalized:" + unicodedata.normalize(c.decode('utf-8'),'NFKD')] += 1 | |
362 contentElements = codepointSeq | |
363 # Histogram: JSON-encoded string repn of the dict | |
364 part["unicodeHistogram"] = json.dumps(codepointHisto) | |
365 # Signature: sequence of codepoints | |
366 part["unicodeSignature"] = " ".join(codepointSeq) | |
367 # Catalog: bag of codepoints | |
368 codepointCatalogElements = [] | |
369 for k in sorted(codepointHisto.keys()): | |
370 v = codepointHisto[k] | |
371 # v copies of this key | |
372 codepointCatalogElements.append(" ".join([k for _ in range(v)])) | |
373 part["unicodeCatalog"] = ", ".join(codepointCatalogElements) | |
374 | |
375 # Histogram: JSON-encoded string repn of the dict | |
376 part["unicodeCategoryHistogram"] = json.dumps(categoryHisto) | |
377 # Signature: sequence of codepoints | |
378 part["unicodeCategorySignature"] = " ".join(categorySeq) | |
379 # Catalog: bag of categories | |
380 categoryCatalogElements = [] | |
381 for k in sorted(categoryHisto.keys()): | |
382 v = categoryHisto[k] | |
383 # v copies of this key | |
384 categoryCatalogElements.append(" ".join([k for _ in range(v)])) | |
385 part["unicodeCategoryCatalog"] = ", ".join(categoryCatalogElements) | |
386 | |
387 # Histogram: JSON-encoded string repn of the dict | |
388 part["unicodeBlockHistogram"] = json.dumps(blockHisto) | |
389 # Signature: sequence of codepoints | |
390 part["unicodeBlockSignature"] = " ".join(blockSeq) | |
391 # Catalog: bag of blocks | |
392 blockCatalogElements = [] | |
393 for k in sorted(blockHisto.keys()): | |
394 v = blockHisto[k] | |
395 # v copies of this key | |
396 blockCatalogElements.append(" ".join([k for _ in range(v)])) | |
397 part["unicodeBlockCatalog"] = ", ".join(blockCatalogElements) | |
398 | |
399 return part | |
400 | 346 |
401 #Test data | 347 #Test data |
402 HEART = u'\u2665' | 348 # HEART = '\u2665' |
403 SMILY = u'\u263a' | 349 # SMILY = '\u263a' |
404 TSU = u'\u30C4' | 350 # TSU = '\u30C4' |
405 LEFT = u'\u27E8' | 351 # LEFT = '\u27E8' |
406 RIGHT = u'\u27E9' | 352 # RIGHT = '\u27E9' |
407 EURO = u'\u20AC' | 353 # EURO = '\u20AC' |
408 | 354 |
409 if True: | 355 # if True: |
410 | 356 |
411 TESTUNICODE = LEFT + "h" + EURO + "llo " + HEART + HEART + SMILY + TSU + " goodby" + EURO + " " + SMILY + TSU + HEART + HEART + HEART + HEART + RIGHT | 357 # TESTUNICODE = LEFT + "h" + EURO + "llo " + HEART + HEART + SMILY + TSU + " goodby" + EURO + " " + SMILY + TSU + HEART + HEART + HEART + HEART + RIGHT |
412 | 358 |
413 print(len(TESTUNICODE)) | 359 # print(len(TESTUNICODE)) |
414 print(json.dumps(TESTUNICODE)) | 360 # print(json.dumps(TESTUNICODE)) |
415 | 361 |
416 TESTDOC = {"@context": "http://localhost:8080/publish/JSON/WSP1WS6-select unix_timestamp(a_importtime)*1000 as timestamp, a_* from ads a join sample s on a_id=s_id limit 50-context.json","schema:provider": {"a": "Organization", "uri": "http://memex.zapto.org/data/organization/1"}, "snapshotUri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/raw","a": "WebPage","dateCreated": "2013-09-24T18:28:00","hasBodyPart": {"text": TESTUNICODE, "a": "WebPageElement"}, "hasTitlePart": {"text": "\u270b\u270b\u270bOnly Best \u270c\u270c\u270c Forget The Rest \u270b\u270b\u270b Outcall Specials TONIGHT \u270c\ud83d\udc8b\ud83d\udc45 Sexy Blonde is UP LATE \ud83d\udc9c\ud83d\udc9b\u270b\u270c - 25", "a": "WebPageElement"}, "uri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/processed"} | 362 # TESTDOC = {"@context": "http://localhost:8080/publish/JSON/WSP1WS6-select unix_timestamp(a_importtime)*1000 as timestamp, a_* from ads a join sample s on a_id=s_id limit 50-context.json","schema:provider": {"a": "Organization", "uri": "http://memex.zapto.org/data/organization/1"}, "snapshotUri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/raw","a": "WebPage","dateCreated": "2013-09-24T18:28:00","hasBodyPart": {"text": TESTUNICODE, "a": "WebPageElement"}, "hasTitlePart": {"text": "\u270b\u270b\u270bOnly Best \u270c\u270c\u270c Forget The Rest \u270b\u270b\u270b Outcall Specials TONIGHT \u270c\ud83d\udc8b\ud83d\udc45 Sexy Blonde is UP LATE \ud83d\udc9c\ud83d\udc9b\u270b\u270c - 25", "a": "WebPageElement"}, "uri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/processed"} |
417 | 363 |
418 analyze(TESTDOC["hasBodyPart"]) | 364 # res=analyze(TESTDOC["hasBodyPart"]["text"]) |
419 json.dump(TESTDOC, sys.stdout, indent=4); | 365 # print(res) |
420 exit(0) | 366 # exit(0) |
367 | |
421 | 368 |
422 for line in sys.stdin: | 369 for line in sys.stdin: |
423 try: | 370 try: |
424 (url, jrep) = line.split('\t') | 371 analyze(line) |
425 d = json.loads(jrep) | |
426 | |
427 analyze(d["hasBodyPart"]) | |
428 analyze(d["hasTitlePart"]) | |
429 # insert gmtime | |
430 # ensure it doesn't collide with any other gentime | |
431 d["unicodeGentime"] = gentime() | |
432 | |
433 print(url + "\t",end="") | |
434 json.dump(d, sys.stdout, sort_keys=True) | |
435 print() | |
436 except ValueError as e: | 372 except ValueError as e: |
437 print(e,file=sys.stderr) | 373 print(e,file=sys.stderr) |
438 pass | 374 pass |
375 for k in dd.keys(): | |
376 print(k) | |
377 pprint.pprint(sorted(list(dd[k].items()), | |
378 key=lambda e:e[1],reverse=True)) | |
379 | |
380 |
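For orientation, a hedged worked example of the three output shapes the original (left-hand) docstring and analyze() describe. The string and the helper code below are made up for illustration; only the field names (unicodeHistogram, unicodeSignature, unicodeCatalog) come from the docstring.

```python
# Worked illustration of the histogram / signature / catalog shapes that the
# old analyze() attached to a part, for the made-up content "héllo ♥♥".
import json
from collections import Counter

content = "héllo ♥♥"
nonascii = [c for c in content if ord(c) > 127]    # >ascii codepoints only

histo = Counter(nonascii)                          # codepoint -> count
signature = " ".join(nonascii)                     # in order:   'é ♥ ♥'
catalog = ", ".join(" ".join([k] * v)              # sorted bag: 'é, ♥ ♥'
                    for k, v in sorted(histo.items()))

print(json.dumps(histo))   # unicodeHistogram, e.g. {"\u00e9": 1, "\u2665": 2}
print(signature)           # unicodeSignature
print(catalog)             # unicodeCatalog
```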
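And a minimal, self-contained sketch of the flow the new (right-hand) version sets up: module-level Counters, an analyze() that tallies every character, a loop over stdin, and a pprint of each histogram at the end. The two-entry _blocks table and the use of raw unicodedata.category codes are stand-ins for the full Blocks.txt-derived table and the category-description mapping in the real file.

```python
#!/usr/bin/env python
# Self-contained sketch of the revised pipeline: read text from stdin and
# report per-codepoint, per-category and per-block histograms.
import sys
import unicodedata
from collections import Counter
from pprint import pprint

_blocks = [                       # illustrative stand-in for Blocks.txt
    (0x0000, 0x007F, 'Basic Latin'),
    (0x2600, 0x26FF, 'Miscellaneous Symbols'),
]

def block(ch):
    cp = ord(ch)
    for start, end, name in _blocks:
        if start <= cp <= end:
            return name
    return None

codepointHisto = Counter()
categoryHisto = Counter()
blockHisto = Counter()
dd = {'codepoints': codepointHisto,
      'categories': categoryHisto,
      'blocks': blockHisto}

def analyze(content):
    for c in content:
        codepointHisto[c] += 1
        categoryHisto[unicodedata.category(c)] += 1   # raw code, e.g. 'Ll'
        blk = block(c)
        if blk:
            blockHisto[blk] += 1

for line in sys.stdin:
    analyze(line)

for k in dd:
    print(k)
    pprint(sorted(dd[k].items(), key=lambda e: e[1], reverse=True))
```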