Mercurial > hg > cc > cirrus_work
changeset 201:3406742894fc
compute (component) uri lengths and a few other properties
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 05 Dec 2023 10:35:15 +0000 |
parents | 0dd36f071b1d |
children | 10ff891fd656 |
files | lib/python/cc/lmh/ulens.py |
diffstat | 1 files changed, 16 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/lmh/ulens.py Tue Dec 05 10:35:15 2023 +0000
@@ -0,0 +1,16 @@
+#!/usr/bin/env python3
+'''extract URI length details from 4-column date-sorted data'''
+
+import sys
+from urllib.parse import urlparse
+
+with open(sys.argv[1],'rb') as f:
+    for l in f:
+        _,_,u,_ = l.split()
+        uu = urlparse(u)
+        print(len(u),
+              *(len(p) for p in uu),
+              1 if b'xn--' in uu.netloc else 0,
+              uu.path.count(b'%'),
+              uu.query.count(b'%'),
+              sep='\t')