changeset 54:dd63412fc882

Port to Python 3 and simplify: drop the signature/catalog outputs and per-document JSON pipeline; accumulate global codepoint/category/block histograms over stdin lines and pretty-print them, sorted by count.
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 29 May 2023 22:03:08 +0100
parents 91d71e9760e8
children 68004ce55703 bc1acb1416ab
files unicode_hist.py
diffstat 1 files changed, 42 insertions(+), 100 deletions(-) [+]
line wrap: on
line diff
--- a/unicode_hist.py	Mon May 29 22:02:52 2023 +0100
+++ b/unicode_hist.py	Mon May 29 22:03:08 2023 +0100
@@ -1,8 +1,8 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-
+# from https://github.com/usc-isi-i2/dig-unicode/blob/master/python/unicode-histogram.py
 import sys
-import re
+import re, pprint
 try:
     import simplejson as json
 except:
@@ -14,18 +14,8 @@
 from time import strftime, gmtime
 
 """
-12 December 2014
-for each of {body, title}:
-  the unicodeSignature is the sequence of >ascii codepoints, in order, space-separated
-  the unicodeCatalog is the bag of >ascii codepoints, sorted/agglomerated using space, comma-separated
-  the unicodeHistogram is a json-encoded python dict/json object mapping codepoint to count
-
-  the unicodeBlockSignature is the sequence of block descriptors (of all >ascii), in order, space-separated
-  the unicodeBlockCatalog is the bag of block descriptors, sorted/agglomerated using space, comma-separated
   the unicodeBlockHistogram is a json-encoded python dict/json object mapping block descriptor to count
 
-  the unicodeCategorySignature is the sequence of category descriptors (of all >ascii), in order, space-separated
-  the unicodeCategoryCatalog is the bag of category descriptors, sorted/agglomerated using space, comma-separated
   the unicodeCategoryHistogram is a json-encoded python dict/json object mapping category descriptor to count
 
   where block and category descriptors are defined via
@@ -50,7 +40,7 @@
 
 def fmtMetadatum(metadatum, style):
     def fmtValue(s):
-        return re.sub("[ -]", "_", re.sub(",", "", s))
+        return re.sub("[ -]", "_", re.sub(",", "", str(s)))
 
     if style=="category":
         category = categoryCodeDescription(unicodedata.category(metadatum))
@@ -292,7 +282,7 @@
     
     '''
 
-    #assert isinstance(ch, unicode) and len(ch) == 1, repr(ch)
+    assert isinstance(ch, str) and len(ch) == 1, repr(ch)
     cp = ord(ch)
     for start, end, name in _blocks:
         if start <= cp <= end:
@@ -336,103 +326,55 @@
 def categoryCodeDescription(category):
     return categoryCodeDescriptions.get(category, "Not Available")
 
-def analyze(part):
-    content = part["text"]
-    codepointSeq = []
-    categorySeq = []
-    blockSeq = []
-    codepointHisto = Counter()
-    categoryHisto = Counter()
-    blockHisto = Counter()
+codepointHisto = Counter()
+categoryHisto = Counter()
+blockHisto = Counter()
+dd={'codepoints':codepointHisto,
+    'categories':categoryHisto,
+    'blocks':blockHisto}
+
+def analyze(content):
     for c in content:
-        if not isAscii(c):
-            codepointHisto[c] += 1
-            codepointSeq.append(c)
-            cat = fmtMetadatum(c, 'category')
-            blk = fmtMetadatum(c, 'block')
-            if cat:
-                categoryHisto[cat] += 1
-                categorySeq.append(cat)
-            if blk:
-                blockHisto[blk] += 1
-                blockSeq.append(blk)
-            # Normal form KD
-            # presumed of minor importance: omitted for now
-            # categoryHisto["normalized:" + unicodedata.normalize(c.decode('utf-8'),'NFKD')] += 1
-    contentElements = codepointSeq
-    # Histogram: JSON-encoded string repn of the dict
-    part["unicodeHistogram"] = json.dumps(codepointHisto)
-    # Signature: sequence of codepoints
-    part["unicodeSignature"] = " ".join(codepointSeq)
-    # Catalog: bag of codepoints
-    codepointCatalogElements = []
-    for k in sorted(codepointHisto.keys()):
-        v = codepointHisto[k]
-        # v copies of this key
-        codepointCatalogElements.append(" ".join([k for _ in range(v)]))
-    part["unicodeCatalog"] = ", ".join(codepointCatalogElements)
-
-    # Histogram: JSON-encoded string repn of the dict
-    part["unicodeCategoryHistogram"] = json.dumps(categoryHisto)
-    # Signature: sequence of codepoints
-    part["unicodeCategorySignature"] = " ".join(categorySeq)
-    # Catalog: bag of categories
-    categoryCatalogElements = []
-    for k in sorted(categoryHisto.keys()):
-        v = categoryHisto[k]
-        # v copies of this key
-        categoryCatalogElements.append(" ".join([k for _ in range(v)]))
-    part["unicodeCategoryCatalog"] = ", ".join(categoryCatalogElements)
-
-    # Histogram: JSON-encoded string repn of the dict
-    part["unicodeBlockHistogram"] = json.dumps(blockHisto)
-    # Signature: sequence of codepoints
-    part["unicodeBlockSignature"] = " ".join(blockSeq)
-    # Catalog: bag of blocks
-    blockCatalogElements = []
-    for k in sorted(blockHisto.keys()):
-        v = blockHisto[k]
-        # v copies of this key
-        blockCatalogElements.append(" ".join([k for _ in range(v)]))
-    part["unicodeBlockCatalog"] = ", ".join(blockCatalogElements)
-
-    return part
+        codepointHisto[c] += 1
+        cat = fmtMetadatum(c, 'category')
+        blk = fmtMetadatum(c, 'block')
+        if cat:
+            categoryHisto[cat] += 1
+        if blk:
+            blockHisto[blk] += 1
+    return (codepointHisto, categoryHisto, blockHisto)
 
 #Test data
-HEART = u'\u2665'
-SMILY = u'\u263a'
-TSU = u'\u30C4'
-LEFT = u'\u27E8'
-RIGHT = u'\u27E9'
-EURO = u'\u20AC'
+# HEART = '\u2665'
+# SMILY = '\u263a'
+# TSU = '\u30C4'
+# LEFT = '\u27E8'
+# RIGHT = '\u27E9'
+# EURO = '\u20AC'
 
-if True:
+# if True:
 
-   TESTUNICODE = LEFT + "h" + EURO + "llo " + HEART + HEART + SMILY + TSU + " goodby" + EURO + " " + SMILY + TSU + HEART + HEART + HEART + HEART + RIGHT
+#    TESTUNICODE = LEFT + "h" + EURO + "llo " + HEART + HEART + SMILY + TSU + " goodby" + EURO + " " + SMILY + TSU + HEART + HEART + HEART + HEART + RIGHT
 
-   print(len(TESTUNICODE))
-   print(json.dumps(TESTUNICODE))
+#    print(len(TESTUNICODE))
+#    print(json.dumps(TESTUNICODE))
 
-   TESTDOC = {"@context": "http://localhost:8080/publish/JSON/WSP1WS6-select unix_timestamp(a_importtime)*1000 as timestamp, a_* from ads a join sample s on a_id=s_id limit 50-context.json","schema:provider": {"a": "Organization", "uri": "http://memex.zapto.org/data/organization/1"}, "snapshotUri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/raw","a": "WebPage","dateCreated": "2013-09-24T18:28:00","hasBodyPart": {"text": TESTUNICODE, "a": "WebPageElement"}, "hasTitlePart": {"text": "\u270b\u270b\u270bOnly Best \u270c\u270c\u270c Forget The Rest \u270b\u270b\u270b Outcall Specials TONIGHT \u270c\ud83d\udc8b\ud83d\udc45 Sexy Blonde is UP LATE \ud83d\udc9c\ud83d\udc9b\u270b\u270c - 25", "a": "WebPageElement"}, "uri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/processed"}
+#    TESTDOC = {"@context": "http://localhost:8080/publish/JSON/WSP1WS6-select unix_timestamp(a_importtime)*1000 as timestamp, a_* from ads a join sample s on a_id=s_id limit 50-context.json","schema:provider": {"a": "Organization", "uri": "http://memex.zapto.org/data/organization/1"}, "snapshotUri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/raw","a": "WebPage","dateCreated": "2013-09-24T18:28:00","hasBodyPart": {"text": TESTUNICODE, "a": "WebPageElement"}, "hasTitlePart": {"text": "\u270b\u270b\u270bOnly Best \u270c\u270c\u270c Forget The Rest \u270b\u270b\u270b Outcall Specials TONIGHT \u270c\ud83d\udc8b\ud83d\udc45 Sexy Blonde is UP LATE \ud83d\udc9c\ud83d\udc9b\u270b\u270c - 25", "a": "WebPageElement"}, "uri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/processed"}
 
-   analyze(TESTDOC["hasBodyPart"])
-   json.dump(TESTDOC, sys.stdout, indent=4);
-   exit(0)
+#    res=analyze(TESTDOC["hasBodyPart"]["text"])
+#    print(res)
+#    exit(0)
+
 
 for line in sys.stdin:
     try:
-        (url, jrep) = line.split('\t')
-        d = json.loads(jrep)
-
-        analyze(d["hasBodyPart"])
-        analyze(d["hasTitlePart"])
-        # insert gmtime
-        # ensure it doesn't collide with any other gentime
-        d["unicodeGentime"] = gentime()
-
-        print(url + "\t",end="")
-        json.dump(d, sys.stdout, sort_keys=True)
-        print()
+        analyze(line)
     except ValueError as e:
         print(e,file=sys.stderr)
         pass
+for k in dd.keys():
+    print(k)
+    pprint.pprint(sorted(list(dd[k].items()),
+                         key=lambda e:e[1],reverse=True))
+
+