diff master/wecu/generate_file_list.py @ 57:ac1a20e627a9

from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted, sac not quite working yet
author Henry S. Thompson <ht@markup.co.uk>
date Wed, 27 May 2020 20:54:34 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/master/wecu/generate_file_list.py	Wed May 27 20:54:34 2020 +0000
@@ -0,0 +1,79 @@
+#!/bin/python
+
+from urllib.request import Request, urlopen, URLopener
+import sys
+import json
+import subprocess
+import os
+
+
+def get_list_of_crawls():
+    req = Request('https://index.commoncrawl.org/collinfo.json')
+    resp = urlopen(req).read().decode('utf-8')
+    crawls_obj = json.loads(resp)
+
+    crawls = []
+    for c in crawls_obj:
+        crawls.append((c['name'], c['id']))
+
+    return crawls
+
+def print_crawls(crawls):
+    i = 0
+    for c in crawls:
+        if 'ARC' not in c[0]:
+            print('{}. {}'.format(i, c[0].replace('Index', '')))
+            i += 1
+
+def choose_crawl_and_download_paths():
+    crawls = get_list_of_crawls()
+    print("Select a crawl [0-{}]:".format(len(crawls)))
+    print_crawls(crawls)
+    try:
+        crawl_no = int(input("Crawl number [0-{}]:".format(len(crawls))))
+    except:
+        print('Error: Enter a valid crawl number')
+        sys.exit(1)
+
+    file_type = input("File Type [wat/wet/warc]:").lower()
+
+    if file_type not in ['warc', 'wat', 'wet']:
+        print("Error: Enter a valid file type")
+        sys.exit(1)
+
+    url_to_fetch = "https://commoncrawl.s3.amazonaws.com/crawl-data/{}/{}.paths.gz".format(crawls[crawl_no][1], file_type)
+    path_file_opener = URLopener()
+    path_file_opener.retrieve(url_to_fetch, "paths.gz")
+
+    subprocess.check_output(['gunzip', '--force', 'paths.gz'])
+
+    return crawls[crawl_no][0]
+
+########
+# MAIN #
+########
+
+
+def main():
+    name_of_crawl = choose_crawl_and_download_paths()
+
+    full_size_str = input("Full crawl or sample? [f/s]: ")
+    if full_size_str == 'f':
+        os.system('mv paths input_paths')
+    else:
+        #Sample selection
+        subset_size = 0
+        try:
+            subset_size = int(input("Choose subset size: "))
+        except:
+            print("Enter a valid integer")
+            sys.exit(1)
+
+        random_str = input("Random sample? [y/n]: ")
+        if random_str == 'y':
+            os.system('shuf -n {} paths > input_paths'.format(subset_size))
+        else:
+            subprocess.call("head -{} paths > input_paths".format(subset_size), shell=True)
+
+    with open("crawl_name.txt", "w+") as cn_f:
+        cn_f.write(name_of_crawl.replace("Index", ""))