Mercurial > hg > cc > azure
comparison master/wecu/generate_file_list.py @ 57:ac1a20e627a9
from lukasz git repo 2020-05-26 (see ~/src/wecu), then edited,
sac not quite working yet
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Wed, 27 May 2020 20:54:34 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
56:8ce6a81e2bb4 | 57:ac1a20e627a9 |
---|---|
1 #!/bin/python | |
2 | |
3 from urllib.request import Request, urlopen, URLopener | |
4 import sys | |
5 import json | |
6 import subprocess | |
7 import os | |
8 | |
9 | |
def get_list_of_crawls():
    """Fetch the list of crawls advertised by the Common Crawl index server.

    Requests ``collinfo.json`` from index.commoncrawl.org, decodes the body
    as UTF-8 and parses it as JSON.

    Returns:
        list: One ``(name, id)`` tuple per entry of the JSON array, in the
        order the server returned them.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        KeyError: If an entry lacks a ``'name'`` or ``'id'`` key.
    """
    req = Request('https://index.commoncrawl.org/collinfo.json')
    # Use the response as a context manager so the connection is closed
    # deterministically rather than whenever it happens to be collected.
    with urlopen(req) as resp:
        crawls_obj = json.loads(resp.read().decode('utf-8'))

    return [(c['name'], c['id']) for c in crawls_obj]
20 | |
def print_crawls(crawls):
    """Print a numbered menu of the selectable crawls.

    Args:
        crawls: Sequence of ``(name, id)`` tuples; only the name (element 0)
            is read here.

    Each line has the form ``"<index>. <name>"`` with every occurrence of
    ``'Index'`` removed from the name.  Entries whose name contains ``'ARC'``
    are not printed, but they still consume an index, so the number shown
    next to a crawl is always its position in ``crawls`` and can be used
    directly to index that sequence.
    """
    for position, crawl in enumerate(crawls):
        name = crawl[0]
        if 'ARC' in name:
            continue
        print('{}. {}'.format(position, name.replace('Index', '')))
27 | |
def choose_crawl_and_download_paths():
    """Ask the user for a crawl and file type, then download its path list.

    The list of crawls is fetched with ``get_list_of_crawls`` and shown with
    ``print_crawls``.  The user is then prompted for a crawl number and a
    file type (``wat``, ``wet`` or ``warc``, case-insensitive).  The matching
    ``<file_type>.paths.gz`` listing is downloaded to ``paths.gz`` in the
    current directory and unpacked with ``gunzip --force``.

    Returns:
        str: The name (element 0) of the selected crawl.

    Raises:
        SystemExit: With status 1 if no crawls are available, or if the
            crawl number or file type entered is invalid.
        urllib.error.URLError: If the download fails.
        subprocess.CalledProcessError: If ``gunzip`` exits non-zero.
    """
    # Local import: the only place in this function that needs shutil.
    import shutil

    crawls = get_list_of_crawls()
    if not crawls:
        print('Error: No crawls available')
        sys.exit(1)

    # Valid answers are list indices, so the upper bound is len - 1.
    last_index = len(crawls) - 1
    print("Select a crawl [0-{}]:".format(last_index))
    print_crawls(crawls)
    try:
        crawl_no = int(input("Crawl number [0-{}]:".format(last_index)))
    except ValueError:
        print('Error: Enter a valid crawl number')
        sys.exit(1)

    # Reject out-of-range input explicitly: a negative number would
    # otherwise silently select a crawl counted from the end of the list,
    # and a too-large one would surface as an IndexError traceback.
    if not 0 <= crawl_no <= last_index:
        print('Error: Enter a valid crawl number')
        sys.exit(1)

    file_type = input("File Type [wat/wet/warc]:").lower()
    if file_type not in ('warc', 'wat', 'wet'):
        print("Error: Enter a valid file type")
        sys.exit(1)

    crawl_name, crawl_id = crawls[crawl_no]
    url_to_fetch = "https://commoncrawl.s3.amazonaws.com/crawl-data/{}/{}.paths.gz".format(crawl_id, file_type)

    # Stream the download to disk with urlopen instead of the deprecated
    # URLopener class, which newer Python releases no longer provide.
    with urlopen(url_to_fetch) as resp, open("paths.gz", "wb") as gz_f:
        shutil.copyfileobj(resp, gz_f)

    # Unpack paths.gz, overwriting any existing "paths" file.
    subprocess.check_output(['gunzip', '--force', 'paths.gz'])

    return crawl_name
51 | |
52 ######## | |
53 # MAIN # | |
54 ######## | |
55 | |
56 | |
def main():
    """Build ``input_paths`` and ``crawl_name.txt`` for a user-chosen crawl.

    Runs ``choose_crawl_and_download_paths`` (which leaves a ``paths`` file
    in the current directory), then asks whether to use the full listing or
    a sample:

    * ``f``: ``paths`` is renamed to ``input_paths``.
    * anything else: the user supplies a subset size and chooses between a
      random sample (``shuf -n``) and the first N lines (``head -n``); the
      result is written to ``input_paths``.

    Finally the crawl name, with every ``'Index'`` removed, is written to
    ``crawl_name.txt``.

    Raises:
        SystemExit: With status 1 if the subset size is not a non-negative
            integer.
        OSError: If ``paths`` cannot be renamed or an output file cannot be
            written.
        subprocess.CalledProcessError: If ``shuf`` or ``head`` exits
            non-zero.
    """
    name_of_crawl = choose_crawl_and_download_paths()

    full_size_str = input("Full crawl or sample? [f/s]: ")
    if full_size_str == 'f':
        # Rename in-process so a failure raises instead of being ignored.
        os.replace('paths', 'input_paths')
    else:
        # Sample selection
        try:
            subset_size = int(input("Choose subset size: "))
        except ValueError:
            print("Enter a valid integer")
            sys.exit(1)
        # A negative count is not a meaningful sample size for either tool.
        if subset_size < 0:
            print("Enter a valid integer")
            sys.exit(1)

        random_str = input("Random sample? [y/n]: ")
        if random_str == 'y':
            sample_cmd = ['shuf', '-n', str(subset_size), 'paths']
        else:
            sample_cmd = ['head', '-n', str(subset_size), 'paths']

        # Argument list plus an explicit stdout file: no shell is involved,
        # and check=True stops the script if the sampling command fails
        # rather than continuing with a truncated or empty input_paths.
        with open('input_paths', 'wb') as out_f:
            subprocess.run(sample_cmd, stdout=out_f, check=True)

    with open("crawl_name.txt", "w") as cn_f:
        cn_f.write(name_of_crawl.replace("Index", ""))