annotate master/src/wecu/wecu.py @ 61:cfaf5223b071

trying to get my own mapper working
author Henry S. Thompson <ht@markup.co.uk>
date Sun, 31 May 2020 12:06:44 +0000
parents 5fdca5baa4e9
children 892e1c0240e1
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
57
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
1 #!/usr/bin/python3
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
2 import os
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
3 import sys
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
4 import argparse
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
5
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
# Built-in defaults; the WECU_* environment variables read further down
# take precedence over them.
HOME = os.getenv('HOME')
DEFAULT_HOSTS = 'hosts'
DEFAULT_CORES = 'cores.txt'
DEFAULT_WD = os.getcwd()

# Per-user configuration: every line of ~/.wecu that is neither blank nor a
# comment is evaluated as a Python expression.
# NOTE(review): eval() executes arbitrary code from that file and cannot run
# assignment statements, so it is unclear how a config line is meant to
# override the DEFAULT_* values above -- confirm the intended file format.
try:
    with open('%s/.wecu' % HOME) as conf:
        for line in conf:
            entry = line.strip()
            # Blank lines and indented comments used to reach eval() and
            # abort start-up with a SyntaxError; skip them instead.
            if not entry or entry.startswith('#'):
                continue
            eval(entry)
except IOError:
    # A missing or unreadable ~/.wecu is fatal, exactly as before (the
    # original handler re-raised and its trailing `pass` was unreachable).
    raise


HOSTS_FILEPATH = os.getenv('WECU_HOSTS', DEFAULT_HOSTS)
CORES_FILEPATH = os.getenv('WECU_CORES', DEFAULT_CORES)
WORK_DIR = os.getenv('WECU_WD', DEFAULT_WD)
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
24
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
25
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
def setup_handler(args):
    """Handle ``wecu setup``: check the configuration or run the setup script.

    Exactly one action is performed, chosen in this order:

    * ``args.check_files`` -- report whether both HOSTS_FILEPATH and
      CORES_FILEPATH exist.
    * ``args.check_nodes_up`` -- for every line of HOSTS_FILEPATH, probe
      port 22 with ``nc -z`` (fanned out through GNU parallel) and print
      whether that node is OK or down.
    * otherwise -- run ``setup.sh`` with ``args.password`` as its argument.
    """
    # Function-scope import so the block does not depend on the file's
    # top-level import list.
    import shlex

    if args.check_files:
        if os.path.exists(HOSTS_FILEPATH) and os.path.exists(CORES_FILEPATH):
            print("Config ok!")
        else:
            print("WECU is not configured! Run `wecu setup`")

        return

    if args.check_nodes_up:
        # Snippet that GNU parallel runs once per host; {} is parallel's
        # placeholder for the current input line.
        probe = (
            'nc -z {} 22 2> /dev/null; '
            'if [ $? -eq 0 ]; '
            'then '
            ' echo Node {} is OK; '
            'else'
            ' echo Node {} is down; '
            'fi'
        )
        # The snippet must reach parallel single-quoted: inside double quotes
        # the invoking shell expands $? itself (to 0) before parallel ever
        # runs, which made every node look reachable.
        command = 'cat %s | parallel --will-cite %s' % (
            shlex.quote(HOSTS_FILEPATH),
            shlex.quote(probe),
        )

        os.system(command)
        return

    # Perform config.  Quote the password so that spaces and shell
    # metacharacters in it are passed to setup.sh verbatim.
    os.system('setup.sh {}'.format(shlex.quote(str(args.password))))
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
51
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
def list_handler(args):
    """Handle ``wecu list``: show the worker machines or the input files.

    ``args.object_to_list`` selects what is shown:

    * ``'machines'`` -- the contents of HOSTS_FILEPATH, then a blank line.
    * ``'input_files'`` -- with ``args.all`` the whole ``input_paths`` file;
      otherwise the contents of ``crawl_name.txt`` followed by the line
      count of ``input_paths``.

    Any other value prints nothing.
    """
    if args.object_to_list == 'machines':
        os.system('cat %s'%HOSTS_FILEPATH)
        print('')
    elif args.object_to_list == 'input_files':
        if args.all:
            os.system('cat input_paths')
            return

        # flush=True keeps our headings ahead of the output written by the
        # child processes when stdout is a pipe or a file.
        print('Crawl name:', flush=True)
        os.system('cat crawl_name.txt')
        # The original bare `print` is a no-op expression in Python 3, so
        # the separating blank line was never emitted.
        print(flush=True)
        print('Number of input files:', flush=True)
        os.system('wc -l < input_paths')
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
66
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
def execute_handler(args):
    """Handle ``wecu execute``: run ``args.command`` on all worker machines.

    Builds one GNU parallel invocation (``--onall``, HOSTS_FILEPATH as the
    ssh login file, three retries, WORK_DIR as the working directory) and
    runs it under ``bash`` so that the ``time`` keyword is available.
    Every file named in ``args.transfer_file`` is passed via
    ``--transferfile`` and gets a ``chmod +x`` on the remote side before
    the command itself.  Output lines containing "Authorized uses only"
    are filtered out.
    """
    # Function-scope imports so the block does not depend on the file's
    # top-level import list.
    import shlex
    import subprocess

    options = [
        "--onall",
        "--sshloginfile", shlex.quote(HOSTS_FILEPATH),
        "--retries", "3",
    ]

    remote_steps = []
    for filename in args.transfer_file or []:
        options += ["--transferfile", shlex.quote(filename)]
        remote_steps.append("chmod +x {};".format(shlex.quote(filename)))
    remote_steps.append(args.command)

    options += ["--will-cite", "--workdir", shlex.quote(WORK_DIR)]

    command = "time parallel {} eval ::: {} 2>&1 | grep -v {}\n".format(
        " ".join(options),
        shlex.quote(" ".join(remote_steps)),
        shlex.quote("Authorized uses only"),
    )

    # Hand the script to bash as a single argv element.  The previous
    # `bash -c "%s"` wrapper re-parsed it inside double quotes, which split
    # the grep pattern at its own quotes and let the local shell expand
    # $variables and backticks in the user's command before it was sent.
    subprocess.call(["bash", "-c", command])
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
87
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
def num_cores(args):
    """Return the number of concurrent jobs to run on each worker.

    An explicit ``args.jobs_per_worker`` is returned unchanged.  When it is
    None, the first line of CORES_FILEPATH is returned as a string with
    trailing whitespace removed.
    """
    requested = args.jobs_per_worker
    if requested is not None:
        return requested

    with open(CORES_FILEPATH) as cores_file:
        return cores_file.readline().rstrip()
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
94
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
def mapred_handler(args):
    """Handle ``wecu mapred``: resolve the per-worker job count and start the job.

    The job count comes from num_cores(args); ``args.mapper`` and
    ``args.reducer`` are forwarded to run_mapred() unchanged.
    """
    jobs_per_worker = num_cores(args)
    mapper_path, reducer_path = args.mapper, args.reducer
    run_mapred(mapper_path, reducer_path, jobs_per_worker)
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
98
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
def sac_handler(args):
    """Handle ``wecu sac``: launch ``run_sac.sh`` for a scan-and-count job.

    The script receives, in order: the per-worker job count, HOSTS_FILEPATH,
    WORK_DIR, the mapper (``sac_mapper.py`` unless ``args.mapper`` is given),
    an optional ``-f '<filter>'`` option, the output mode (``by-file`` or
    ``aggregate``), ``true``/``false`` for regex matching, and the patterns,
    each wrapped in double quotes.
    """
    regex_str = 'true' if args.regex else 'false'

    # NOTE(review): patterns and the filter are interpolated into a shell
    # command with fixed quote characters; a value containing the same quote
    # character would break the command line -- confirm against run_sac.sh
    # before changing the quoting.
    quoted_patterns = ['"{}"'.format(pattern) for pattern in args.pattern]
    patterns_str = ' '.join(quoted_patterns)

    cores_per_worker = num_cores(args)

    if args.mapper is None:
        mapper = 'sac_mapper.py'
    else:
        mapper = args.mapper

    if args.filter is None:
        filter_option = ''
    else:
        filter_option = "-f '%s'" % args.filter

    if args.by_file:
        output_mode = 'by-file'
    else:
        output_mode = 'aggregate'

    script_args = (
        cores_per_worker,
        HOSTS_FILEPATH,
        WORK_DIR,
        mapper,
        filter_option,
        output_mode,
        regex_str,
        patterns_str,
    )
    os.system('run_sac.sh {} {} {} {} {} {} {} {}'.format(*script_args))
57
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
120
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
def generate_handler(args):
    """Handle ``wecu generate-sample`` by calling ``generate_file_list.main()``.

    ``args`` is accepted so the function matches the other handlers; it is
    not used.
    """
    # Imported inside the function, so the module is only loaded when this
    # sub-command actually runs.
    import generate_file_list as sample_generator

    sample_generator.main()
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
124
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
def utilization_handler(args):
    """Handle ``wecu utilisation``: sample hardware usage, then draw the graph.

    Runs ``./get_hardware_util.sh`` with the measurement duration in seconds
    (``args.seconds`` when it is truthy, otherwise 120) and then calls
    ``generate_hardware_graph`` with ``args.output_graph_filename``.
    """
    # Imported inside the function, so the module is only loaded when this
    # sub-command actually runs.
    from graph_hardware_usage import generate_hardware_graph

    # A missing (None) or zero --seconds value falls back to 120.
    duration_seconds = args.seconds or 120

    os.system('./get_hardware_util.sh {}'.format(duration_seconds))
    generate_hardware_graph(args.output_graph_filename)
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
134
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
def run_mapred(mapper, reducer, cores):
    """Run ``./run_mapreduce.sh`` with the given job count, mapper and reducer.

    Args:
        mapper: path to the map phase executable.
        reducer: path to the reduce phase executable.
        cores: number of concurrent jobs per worker, passed as the script's
            first argument.
    """
    # The original body formatted the undefined name `cores_per_worker`,
    # so every call raised NameError; the value is the `cores` parameter.
    os.system('./run_mapreduce.sh {} {} {}'.format(cores, mapper, reducer))
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
138
ac1a20e627a9 from lukasz git repo 2020-05-26 (see ~/src/wecu), then editted,
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
# Help text shared by the --jobs-per-worker option of `mapred` and `sac`.
_JOBS_PER_WORKER_HELP = "By deafult the number of concurrent tasks is set to the number of available logical cores. Provide this flag to set a different number of concurrent tasks."

# Top-level parser; every sub-command stores its handler function in
# `handler`, which is called once the arguments have been parsed.
parser = argparse.ArgumentParser(
    description='Wee CommonCrawl Utility (WECU) is a CLI tool which allows running scan-and-count workloads on Common Crawl data without',
)
subparsers = parser.add_subparsers(help='A sub-command to be executed')

# Cluster Setup parser
parser_setup = subparsers.add_parser(
    'setup',
    help='Setup the framework to operate on an HDInsight cluster',
)
parser_setup.add_argument(
    'password',
    type=str,
    help='Password to the cluster - used to setup passwordless communication',
)
parser_setup.add_argument(
    '--check_files',
    action="store_true",
    help='Use this flag to check that all the required configuration files are in place',
)
parser_setup.add_argument(
    '--check_nodes_up',
    action='store_true',
    help='Check if the worker machines are responsive',
)
parser_setup.set_defaults(handler=setup_handler)

# Show cluster configuration parser
parser_list = subparsers.add_parser('list', help='List configuration')
parser_list.add_argument(
    'object_to_list',
    type=str,
    choices=['machines', 'input_files'],
    help='Choose whether to list machines or (summary) of currently selected input files',
)
parser_list.add_argument(
    '--all',
    action='store_true',
    help='Show a list of all input files instead of a summary (Can be used alongside input_files argument only)',
)
parser_list.set_defaults(handler=list_handler)

# Remote command execution parser
execute_list = subparsers.add_parser(
    'execute',
    help='Execute arbitrary command on all worker machines in the cluster in parallel',
)
execute_list.add_argument('command', type=str)
execute_list.add_argument(
    '--transfer_file',
    nargs='+',
    help="Provide files which should be transferred to the remote workers before the executions starts.",
)
execute_list.set_defaults(handler=execute_handler)

# MapReduce jobs parser
mapred_list = subparsers.add_parser(
    'mapred',
    help="Execute mapreduce jobs using the provided mapper and reducer executable",
)
mapred_list.add_argument(
    'mapper',
    type=str,
    help='Path to the map phase executable',
)
mapred_list.add_argument(
    'reducer',
    type=str,
    help='Path to the reduce phase executable',
)
mapred_list.add_argument(
    '--jobs-per-worker',
    type=int,
    help=_JOBS_PER_WORKER_HELP,
)
mapred_list.set_defaults(handler=mapred_handler)

# Scan-and-count parser
sac_list = subparsers.add_parser(
    'sac',
    help='Execute scan-and-count (SAC) workloads directly from the command line',
)
sac_list.add_argument('pattern', type=str, nargs='+')
sac_list.add_argument(
    '--regex',
    action="store_true",
    help="Provide this flag to indicate that the provided strings should be treated as regular expressions",
)
sac_list.add_argument(
    '--by-file',
    action="store_true",
    help="Provide this flag to indicate that the output should not be aggregated and displayed per file instead",
)
sac_list.add_argument(
    '--mapper',
    type=str,
    help="Supply a bespoke mapper for use in place of sac_mapper.py",
)
sac_list.add_argument(
    '--filter',
    type=str,
    help="Supply a filter on the unzipped warc file ahead of the mapper",
)
sac_list.add_argument(
    '--jobs-per-worker',
    type=int,
    help=_JOBS_PER_WORKER_HELP,
)
sac_list.set_defaults(handler=sac_handler)

# Generate sample parser
generate_parser = subparsers.add_parser(
    'generate-sample',
    help='Generate a sample of a chosen Common Crawl month',
)
generate_parser.set_defaults(handler=generate_handler)

# Generate utilization graph parser
utilization_graph = subparsers.add_parser(
    'utilisation',
    help='Generate CPU utilisation graph and files',
)
utilization_graph.add_argument(
    'output_graph_filename',
    type=str,
    help='The path to the location of the output graph',
)
utilization_graph.add_argument(
    '--seconds',
    type=int,
    help='Provide this flag to change how long the utilisation is measure for (the default is 120 seconds).',
)
utilization_graph.set_defaults(handler=utilization_handler)

# With no command-line arguments at all, show the help text and exit
# successfully instead of reaching the dispatch below.
if len(sys.argv) < 2:
    parser.print_help()
    sys.exit(0)

# Parse the command line and dispatch to the selected sub-command's handler.
args = parser.parse_args()
args.handler(args)