view master/src/wecu/generate_file_list.py @ 66:b04870ab3035

don't over-count duplicate URIs in multiple properties, produce composite keys instead
author Henry S. Thompson <ht@markup.co.uk>
date Thu, 04 Jun 2020 16:10:55 +0000
parents a3edba8dab11
children
line wrap: on
line source

#!/bin/python

from urllib.request import Request, urlopen, URLopener
import sys
import json
import subprocess
import os


def get_list_of_crawls():
    req = Request('https://index.commoncrawl.org/collinfo.json')
    resp = urlopen(req).read().decode('utf-8')
    crawls_obj = json.loads(resp)

    crawls = []
    for c in crawls_obj:
        crawls.append((c['name'], c['id']))

    return crawls

def print_crawls(crawls):
    i = 0
    for c in crawls:
        if 'ARC' not in c[0]:
            print('{}. {}'.format(i, c[0].replace('Index', '')))
            i += 1

def choose_crawl_and_download_paths():
    crawls = get_list_of_crawls()
    print("Select a crawl [0-{}]:".format(len(crawls)))
    print_crawls(crawls)
    try:
        crawl_no = int(input("Crawl number [0-{}]:".format(len(crawls))))
    except:
        print('Error: Enter a valid crawl number')
        sys.exit(1)

    file_type = input("File Type [wat/wet/warc]:").lower()

    if file_type not in ['warc', 'wat', 'wet']:
        print("Error: Enter a valid file type")
        sys.exit(1)

    url_to_fetch = "https://commoncrawl.s3.amazonaws.com/crawl-data/{}/{}.paths.gz".format(crawls[crawl_no][1], file_type)
    path_file_opener = URLopener()
    path_file_opener.retrieve(url_to_fetch, "paths.gz")

    subprocess.check_output(['gunzip', '--force', 'paths.gz'])

    return crawls[crawl_no][0]

########
# MAIN #
########


def main():
    name_of_crawl = choose_crawl_and_download_paths()

    full_size_str = input("Full crawl or sample? [f/s]: ")
    if full_size_str == 'f':
        os.system('mv paths input_paths')
    else:
        #Sample selection
        subset_size = 0
        try:
            subset_size = int(input("Choose subset size: "))
        except:
            print("Enter a valid integer")
            sys.exit(1)

        random_str = input("Random sample? [y/n]: ")
        if random_str == 'y':
            os.system('shuf -n {} paths > input_paths'.format(subset_size))
        else:
            subprocess.call("head -{} paths > input_paths".format(subset_size), shell=True)

    with open("crawl_name.txt", "w+") as cn_f:
        cn_f.write(name_of_crawl.replace("Index", ""))