comparison unicode_hist.py @ 54:dd63412fc882

Refactor analysis pipeline: replace per-part dict-mutating analyze(part) with module-level Counter histograms (codepoints, categories, blocks), make analyze() take raw text, stream input line-by-line from stdin, and pretty-print the sorted histograms at exit; comment out the embedded test harness.
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 29 May 2023 22:03:08 +0100
parents 6faea25a69b3
children
comparison
equal deleted inserted replaced
53:91d71e9760e8 54:dd63412fc882
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*- 2 # -*- coding: utf-8 -*-
3 3 # from https://github.com/usc-isi-i2/dig-unicode/blob/master/python/unicode-histogram.py
4 import sys 4 import sys
5 import re 5 import re, pprint
6 try: 6 try:
7 import simplejson as json 7 import simplejson as json
8 except: 8 except:
9 import json 9 import json
10 10
12 import unicodedata 12 import unicodedata
13 13
14 from time import strftime, gmtime 14 from time import strftime, gmtime
15 15
16 """ 16 """
17 12 December 2014
18 for each of {body, title}:
19 the unicodeSignature is the sequence of >ascii codepoints, in order, space-separated
20 the unicodeCatalog is the bag of >ascii codepoints, sorted/agglomerated using space, comma-separated
21 the unicodeHistogram is a json-encoded python dict/json object mapping codepoint to count
22
23 the unicodeBlockSignature is the sequence of block descriptors (of all >ascii), in order, space-separated
24 the unicodeBlockCatalog is the bag of block descriptors, sorted/agglomerated using space, comma-separated
25 the unicodeBlockHistogram is a json-encoded python dict/json object mapping block descriptor to count 17 the unicodeBlockHistogram is a json-encoded python dict/json object mapping block descriptor to count
26 18
27 the unicodeCategorySignature is the sequence of category descriptors (of all >ascii), in order, space-separated
28 the unicodeCategoryCatalog is the bag of category descriptors, sorted/agglomerated using space, comma-separated
29 the unicodeCategoryHistogram is a json-encoded python dict/json object mapping category descriptor to count 19 the unicodeCategoryHistogram is a json-encoded python dict/json object mapping category descriptor to count
30 20
31 where block and category descriptors are defined via 21 where block and category descriptors are defined via
32 # From http://stackoverflow.com/a/245072 22 # From http://stackoverflow.com/a/245072
33 # retrieved from http://unicode.org/Public/UNIDATA/Blocks.txt 23 # retrieved from http://unicode.org/Public/UNIDATA/Blocks.txt
48 def fmtCodepoint(codepoint, style): 38 def fmtCodepoint(codepoint, style):
49 return codepoint 39 return codepoint
50 40
51 def fmtMetadatum(metadatum, style): 41 def fmtMetadatum(metadatum, style):
52 def fmtValue(s): 42 def fmtValue(s):
53 return re.sub("[ -]", "_", re.sub(",", "", s)) 43 return re.sub("[ -]", "_", re.sub(",", "", str(s)))
54 44
55 if style=="category": 45 if style=="category":
56 category = categoryCodeDescription(unicodedata.category(metadatum)) 46 category = categoryCodeDescription(unicodedata.category(metadatum))
57 # return "category:" + fmtValue(category) 47 # return "category:" + fmtValue(category)
58 return fmtValue(category) 48 return fmtValue(category)
290 'Tamil' 280 'Tamil'
291 >>> block(unichr(0xe0080)) 281 >>> block(unichr(0xe0080))
292 282
293 ''' 283 '''
294 284
295 #assert isinstance(ch, unicode) and len(ch) == 1, repr(ch) 285 assert isinstance(ch, str) and len(ch) == 1, repr(ch)
296 cp = ord(ch) 286 cp = ord(ch)
297 for start, end, name in _blocks: 287 for start, end, name in _blocks:
298 if start <= cp <= end: 288 if start <= cp <= end:
299 return name 289 return name
300 290
334 'Zs': "Separator, Space"} 324 'Zs': "Separator, Space"}
335 325
336 def categoryCodeDescription(category): 326 def categoryCodeDescription(category):
337 return categoryCodeDescriptions.get(category, "Not Available") 327 return categoryCodeDescriptions.get(category, "Not Available")
338 328
339 def analyze(part): 329 codepointHisto = Counter()
340 content = part["text"] 330 categoryHisto = Counter()
341 codepointSeq = [] 331 blockHisto = Counter()
342 categorySeq = [] 332 dd={'codepoints':codepointHisto,
343 blockSeq = [] 333 'categories':categoryHisto,
344 codepointHisto = Counter() 334 'blocks':blockHisto}
345 categoryHisto = Counter() 335
346 blockHisto = Counter() 336 def analyze(content):
347 for c in content: 337 for c in content:
348 if not isAscii(c): 338 codepointHisto[c] += 1
349 codepointHisto[c] += 1 339 cat = fmtMetadatum(c, 'category')
350 codepointSeq.append(c) 340 blk = fmtMetadatum(c, 'block')
351 cat = fmtMetadatum(c, 'category') 341 if cat:
352 blk = fmtMetadatum(c, 'block') 342 categoryHisto[cat] += 1
353 if cat: 343 if blk:
354 categoryHisto[cat] += 1 344 blockHisto[blk] += 1
355 categorySeq.append(cat) 345 return (codepointHisto, categoryHisto, blockHisto)
356 if blk:
357 blockHisto[blk] += 1
358 blockSeq.append(blk)
359 # Normal form KD
360 # presumed of minor importance: omitted for now
361 # categoryHisto["normalized:" + unicodedata.normalize(c.decode('utf-8'),'NFKD')] += 1
362 contentElements = codepointSeq
363 # Histogram: JSON-encoded string repn of the dict
364 part["unicodeHistogram"] = json.dumps(codepointHisto)
365 # Signature: sequence of codepoints
366 part["unicodeSignature"] = " ".join(codepointSeq)
367 # Catalog: bag of codepoints
368 codepointCatalogElements = []
369 for k in sorted(codepointHisto.keys()):
370 v = codepointHisto[k]
371 # v copies of this key
372 codepointCatalogElements.append(" ".join([k for _ in range(v)]))
373 part["unicodeCatalog"] = ", ".join(codepointCatalogElements)
374
375 # Histogram: JSON-encoded string repn of the dict
376 part["unicodeCategoryHistogram"] = json.dumps(categoryHisto)
377 # Signature: sequence of codepoints
378 part["unicodeCategorySignature"] = " ".join(categorySeq)
379 # Catalog: bag of categories
380 categoryCatalogElements = []
381 for k in sorted(categoryHisto.keys()):
382 v = categoryHisto[k]
383 # v copies of this key
384 categoryCatalogElements.append(" ".join([k for _ in range(v)]))
385 part["unicodeCategoryCatalog"] = ", ".join(categoryCatalogElements)
386
387 # Histogram: JSON-encoded string repn of the dict
388 part["unicodeBlockHistogram"] = json.dumps(blockHisto)
389 # Signature: sequence of codepoints
390 part["unicodeBlockSignature"] = " ".join(blockSeq)
391 # Catalog: bag of blocks
392 blockCatalogElements = []
393 for k in sorted(blockHisto.keys()):
394 v = blockHisto[k]
395 # v copies of this key
396 blockCatalogElements.append(" ".join([k for _ in range(v)]))
397 part["unicodeBlockCatalog"] = ", ".join(blockCatalogElements)
398
399 return part
400 346
401 #Test data 347 #Test data
402 HEART = u'\u2665' 348 # HEART = '\u2665'
403 SMILY = u'\u263a' 349 # SMILY = '\u263a'
404 TSU = u'\u30C4' 350 # TSU = '\u30C4'
405 LEFT = u'\u27E8' 351 # LEFT = '\u27E8'
406 RIGHT = u'\u27E9' 352 # RIGHT = '\u27E9'
407 EURO = u'\u20AC' 353 # EURO = '\u20AC'
408 354
409 if True: 355 # if True:
410 356
411 TESTUNICODE = LEFT + "h" + EURO + "llo " + HEART + HEART + SMILY + TSU + " goodby" + EURO + " " + SMILY + TSU + HEART + HEART + HEART + HEART + RIGHT 357 # TESTUNICODE = LEFT + "h" + EURO + "llo " + HEART + HEART + SMILY + TSU + " goodby" + EURO + " " + SMILY + TSU + HEART + HEART + HEART + HEART + RIGHT
412 358
413 print(len(TESTUNICODE)) 359 # print(len(TESTUNICODE))
414 print(json.dumps(TESTUNICODE)) 360 # print(json.dumps(TESTUNICODE))
415 361
416 TESTDOC = {"@context": "http://localhost:8080/publish/JSON/WSP1WS6-select unix_timestamp(a_importtime)*1000 as timestamp, a_* from ads a join sample s on a_id=s_id limit 50-context.json","schema:provider": {"a": "Organization", "uri": "http://memex.zapto.org/data/organization/1"}, "snapshotUri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/raw","a": "WebPage","dateCreated": "2013-09-24T18:28:00","hasBodyPart": {"text": TESTUNICODE, "a": "WebPageElement"}, "hasTitlePart": {"text": "\u270b\u270b\u270bOnly Best \u270c\u270c\u270c Forget The Rest \u270b\u270b\u270b Outcall Specials TONIGHT \u270c\ud83d\udc8b\ud83d\udc45 Sexy Blonde is UP LATE \ud83d\udc9c\ud83d\udc9b\u270b\u270c - 25", "a": "WebPageElement"}, "uri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/processed"} 362 # TESTDOC = {"@context": "http://localhost:8080/publish/JSON/WSP1WS6-select unix_timestamp(a_importtime)*1000 as timestamp, a_* from ads a join sample s on a_id=s_id limit 50-context.json","schema:provider": {"a": "Organization", "uri": "http://memex.zapto.org/data/organization/1"}, "snapshotUri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/raw","a": "WebPage","dateCreated": "2013-09-24T18:28:00","hasBodyPart": {"text": TESTUNICODE, "a": "WebPageElement"}, "hasTitlePart": {"text": "\u270b\u270b\u270bOnly Best \u270c\u270c\u270c Forget The Rest \u270b\u270b\u270b Outcall Specials TONIGHT \u270c\ud83d\udc8b\ud83d\udc45 Sexy Blonde is UP LATE \ud83d\udc9c\ud83d\udc9b\u270b\u270c - 25", "a": "WebPageElement"}, "uri": "http://memex.zapto.org/data/page/850753E7323B188B93E6E28F730F2BFBFB1CE00B/1396493689000/processed"}
417 363
418 analyze(TESTDOC["hasBodyPart"]) 364 # res=analyze(TESTDOC["hasBodyPart"]["text"])
419 json.dump(TESTDOC, sys.stdout, indent=4); 365 # print(res)
420 exit(0) 366 # exit(0)
367
421 368
422 for line in sys.stdin: 369 for line in sys.stdin:
423 try: 370 try:
424 (url, jrep) = line.split('\t') 371 analyze(line)
425 d = json.loads(jrep)
426
427 analyze(d["hasBodyPart"])
428 analyze(d["hasTitlePart"])
429 # insert gmtime
430 # ensure it doesn't collide with any other gentime
431 d["unicodeGentime"] = gentime()
432
433 print(url + "\t",end="")
434 json.dump(d, sys.stdout, sort_keys=True)
435 print()
436 except ValueError as e: 372 except ValueError as e:
437 print(e,file=sys.stderr) 373 print(e,file=sys.stderr)
438 pass 374 pass
375 for k in dd.keys():
376 print(k)
377 pprint.pprint(sorted(list(dd[k].items()),
378 key=lambda e:e[1],reverse=True))
379
380