comparison master/wecu/generate_file_list.py @ 57:ac1a20e627a9

from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited, sac not quite working yet
author Henry S. Thompson <ht@markup.co.uk>
date Wed, 27 May 2020 20:54:34 +0000
parents
children
comparison
equal deleted inserted replaced
56:8ce6a81e2bb4 57:ac1a20e627a9
1 #!/bin/python
2
3 from urllib.request import Request, urlopen, URLopener
4 import sys
5 import json
6 import subprocess
7 import os
8
9
def get_list_of_crawls():
    """Fetch the Common Crawl collection index and list the crawls in it.

    Downloads ``collinfo.json`` from index.commoncrawl.org, decodes it as
    UTF-8 and parses it as JSON.

    Returns:
        A list of ``(name, id)`` tuples, one per entry of the JSON array, in
        the order the server returned them.
    """
    req = Request('https://index.commoncrawl.org/collinfo.json')
    # Close the HTTP response deterministically instead of leaving the
    # socket for the garbage collector to clean up.
    with urlopen(req) as resp:
        crawls_obj = json.loads(resp.read().decode('utf-8'))

    return [(c['name'], c['id']) for c in crawls_obj]
20
def print_crawls(crawls):
    """Print a numbered menu of crawls, leaving out ARC entries.

    Args:
        crawls: Sequence of ``(name, id)`` tuples.

    The number printed next to a crawl is its position in ``crawls``, so it
    can be used directly as an index into that same list. Entries whose name
    contains ``'ARC'`` are not printed but still occupy their position, which
    leaves a gap in the displayed numbering. The word ``'Index'`` is removed
    from each displayed name.
    """
    for position, crawl in enumerate(crawls):
        name = crawl[0]
        if 'ARC' in name:
            continue
        print('{}. {}'.format(position, name.replace('Index', '')))
27
def choose_crawl_and_download_paths():
    """Interactively pick a crawl and file type, then fetch its path listing.

    Prompts on stdin for a crawl number and a file type (wat/wet/warc),
    downloads the matching ``<type>.paths.gz`` into ``paths.gz`` in the
    current directory and decompresses it with ``gunzip`` to ``paths``.

    Any invalid answer prints an error message and exits the process with
    status 1.

    Returns:
        The name of the selected crawl (first element of its tuple).
    """
    # Local import: only this function needs it.
    import shutil

    crawls = get_list_of_crawls()
    if not crawls:
        print('Error: No crawls available')
        sys.exit(1)

    # Valid answers are list indices, so the upper bound is len - 1.
    last_index = len(crawls) - 1
    print("Select a crawl [0-{}]:".format(last_index))
    print_crawls(crawls)
    try:
        crawl_no = int(input("Crawl number [0-{}]:".format(last_index)))
    except (ValueError, EOFError):
        print('Error: Enter a valid crawl number')
        sys.exit(1)

    # Reject out-of-range numbers explicitly; a negative value would
    # otherwise silently select a crawl from the end of the list.
    if not 0 <= crawl_no <= last_index:
        print('Error: Enter a valid crawl number')
        sys.exit(1)

    file_type = input("File Type [wat/wet/warc]:").strip().lower()

    if file_type not in ('warc', 'wat', 'wet'):
        print("Error: Enter a valid file type")
        sys.exit(1)

    crawl_name, crawl_id = crawls[crawl_no]
    url_to_fetch = "https://commoncrawl.s3.amazonaws.com/crawl-data/{}/{}.paths.gz".format(crawl_id, file_type)

    # URLopener is deprecated; stream the response to disk with urlopen.
    with urlopen(url_to_fetch) as resp, open("paths.gz", "wb") as out:
        shutil.copyfileobj(resp, out)

    # --force overwrites a "paths" file left over from an earlier run.
    subprocess.check_output(['gunzip', '--force', 'paths.gz'])

    return crawl_name
51
52 ########
53 # MAIN #
54 ########
55
56
def main():
    """Build ``input_paths`` for the chosen crawl and record the crawl name.

    After the crawl's path listing has been downloaded to ``paths``, asks
    whether to use the full listing or a sample:

    * ``f``: ``paths`` is renamed to ``input_paths``.
    * anything else: asks for a subset size and whether to sample randomly,
      then writes either a random selection or the first N lines of
      ``paths`` to ``input_paths`` (``paths`` is left in place).

    Finally writes the crawl name, with ``'Index'`` removed, to
    ``crawl_name.txt``. Exits with status 1 if the subset size is not a
    non-negative integer.
    """
    # Local imports: only the sampling branch needs them.
    import itertools
    import random

    name_of_crawl = choose_crawl_and_download_paths()

    full_size_str = input("Full crawl or sample? [f/s]: ")
    if full_size_str == 'f':
        # Same effect as `mv paths input_paths`, but a failure raises
        # instead of being silently ignored.
        os.replace('paths', 'input_paths')
    else:
        # Sample selection
        try:
            subset_size = int(input("Choose subset size: "))
        except ValueError:
            print("Enter a valid integer")
            sys.exit(1)
        if subset_size < 0:
            print("Enter a valid integer")
            sys.exit(1)

        random_str = input("Random sample? [y/n]: ")
        with open('paths') as src:
            if random_str == 'y':
                lines = src.read().splitlines()
                # Asking for more lines than exist yields all of them,
                # shuffled, as `shuf -n` does.
                picked = random.sample(lines, min(subset_size, len(lines)))
                chosen = [line + '\n' for line in picked]
            else:
                # Equivalent of `head -N paths`.
                chosen = list(itertools.islice(src, subset_size))

        with open('input_paths', 'w') as dst:
            dst.writelines(chosen)

    with open("crawl_name.txt", "w") as cn_f:
        cn_f.write(name_of_crawl.replace("Index", ""))