changeset 201:3406742894fc

compute (component) uri lengths and a few other properties
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 05 Dec 2023 10:35:15 +0000
parents 0dd36f071b1d
children 10ff891fd656
files lib/python/cc/lmh/ulens.py
diffstat 1 files changed, 16 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/lmh/ulens.py	Tue Dec 05 10:35:15 2023 +0000
@@ -0,0 +1,16 @@
+#!/usr/bin/env python3
+'''extract URI length details from 4-column date-sorted data'''
+
+import sys
+from urllib.parse import urlparse
+
+with open(sys.argv[1],'rb') as f:  # binary mode: rows are handled as bytes, never decoded
+  for l in f:
+    _,_,u,_ = l.split()  # requires exactly 4 whitespace-separated fields; the 3rd is the URI
+    uu = urlparse(u)  # bytes in, so all six components come back as bytes
+    print(len(u),  # total URI length in bytes
+          *(len(p) for p in uu),  # scheme, netloc, path, params, query, fragment lengths
+          1 if b'xn--' in uu.netloc.lower() else 0,  # IDN flag; ACE prefix is case-insensitive
+          uu.path.count(b'%'),  # number of '%' bytes in the path
+          uu.query.count(b'%'),  # number of '%' bytes in the query
+          sep='\t')