Mercurial > hg > cc > cirrus_home
changeset 0:65a56c0d1c1f
bolting the barn door...
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 18 Feb 2020 13:15:05 +0000 |
parents | |
children | a4b0359456bc |
files | .bashrc .emacs bin/doPlinks.sh bin/plinks.py bin/plinks.sh listHdrsJob.sh plinksJob.sh |
diffstat | 7 files changed, 149 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# .bashrc
# Origin: b/.bashrc, added Tue Feb 18 13:15:05 2020 +0000.

# Source global definitions
if [ -f /etc/bashrc ]; then
  . /etc/bashrc
fi

# Uncomment the following line if you don't like systemctl's auto-paging feature:
# export SYSTEMD_PAGER=

# User specific aliases and functions

# Only call `module` where it is defined, so shells without it start
# without a "command not found" error.
if command -v module >/dev/null 2>&1; then
  module add gnu-parallel
fi
# NOTE(review): N1/N2 look like node host names -- confirm.
export N1=rli4n35
export N2=rli6n35
export NN="$N1 $N2"

# Tools

# lss [ARGS...]
# Long-list ARGS with /bin/ls, sorted numerically on field 5 of the
# `ls -l` output (the size column), largest first.
function lss() {
  /bin/ls -l "$@" | sort -nr -k 5,5
}

# sus [FILES...]
# Sort the input, count identical lines, and print the counts in
# descending numeric order.
function sus() {
  sort "$@" | uniq -c | sort -k1nr,1
}

# tot
# Sum the first field of every line on stdin and print the total as an
# unsigned integer.
function tot() {
  awk '{sum+=$1} END {printf "%u\n",sum}'
}

# btot
# Sum stdin with Python integer arithmetic; each input line must be a
# single integer (int() is applied to the whole line).
btot() {
  python3 -c 'import sys
n=0
for l in sys.stdin:
 n+=int(l)
print(n)
'
}

# typecat NAME
# If NAME resolves to a file on disk (found on PATH or in the hash table),
# print that file's contents; otherwise print what `type NAME` reports
# (alias, function, builtin, keyword).
# Returns: the status of `type` if NAME is unknown, else that of cat/printf.
function typecat() {
  local desc path
  # `type` writes its own "not found" message to stderr.
  desc=$(type "$1") || return
  # Patterns are anchored on "NAME is ..." at the start of the output so
  # that a function body containing " is /" is not mistaken for a path.
  case $desc in
    "$1 is hashed ("*)
      path=${desc#"$1 is hashed ("}
      cat -- "${path%")"}"
      ;;
    "$1 is /"*)
      # Take everything after "NAME is " so paths with spaces stay intact.
      cat -- "${desc#"$1 is "}"
      ;;
    *)
      printf '%s\n' "$desc"
      ;;
  esac
}

export HISTSIZE=3000
export PYTHONPATH=$HOME/lib/python3.6/site-packages
;; .emacs
;; Origin: b/.emacs, added Tue Feb 18 13:15:05 2020 +0000.

;; Variable settings maintained by Custom: unified diffs, and auto-fill
;; plus text-mode-hook-identify on text-mode-hook.
(custom-set-variables
 ;; custom-set-variables was added by Custom.
 ;; If you edit it by hand, you could mess it up, so be careful.
 ;; Your init file should contain only one such instance.
 ;; If there is more than one, they won't work right.
 '(diff-switches "-u")
 '(text-mode-hook '(turn-on-auto-fill text-mode-hook-identify)))

;;; uncomment for CJK utf-8 support for non-Asian users
;; (require 'un-define)

;; Face settings maintained by Custom: the default face.
(custom-set-faces
 ;; custom-set-faces was added by Custom.
 ;; If you edit it by hand, you could mess it up, so be careful.
 ;; Your init file should contain only one such instance.
 ;; If there is more than one, they won't work right.
 '(default
   ((t (:family "DejaVu Sans Mono"
        :foundry "unknown"
        :slant normal
        :weight normal
        :height 98
        :width normal)))))
#!/usr/bin/bash
# Origin: b/bin/doPlinks.sh, added Tue Feb 18 13:15:05 2020 +0000.
#
# Usage: doPlinks.sh ID < list-of-files
#
# Reads one file name per line from stdin and runs plinks.py on each.
# Output of the k-th successful file goes to /dev/shm/x/links_ID_k
# (k counts successes only, starting at 0); names of files on which
# plinks.py fails are appended to /dev/shm/x/badpdfs_ID.  Finally
# /dev/shm/x/ is copied with rsync into ./links.
#
# Arguments: $1 - ID used in the output file names

me=$1
mine=0
shm=/dev/shm/x

# -p: plinks.sh starts many instances of this script via parallel, so the
# directory usually exists already; plain mkdir would fail in that case.
mkdir -p "$shm"

# IFS= / -r keep each name exactly as written; the `|| [[ -n ... ]]` part
# still processes a final line that has no trailing newline.
while IFS= read -r f || [[ -n "$f" ]]; do
  # stdin is redirected so the child can never consume the file list.
  if plinks.py "$f" > "$shm/links_${me}_${mine}" 2>/dev/null </dev/null; then
    ((mine += 1))
  else
    printf '%s\n' "$f" >> "$shm/badpdfs_${me}"
    # Drop the partial output; the index is reused by the next file.
    rm -f -- "$shm/links_${me}_${mine}"
  fi
done

rsync -a "$shm/" links
#!/lustre/sw/miniconda3/bin/python3
# Origin: b/bin/plinks.py, added Tue Feb 18 13:15:05 2020 +0000.
"""Print the references found in a PDF, one per line, as "<key>\\t<reference>".

Usage:
    plinks.py FILE         extract and print the references of FILE
    plinks.py -t N FILE    do that N times and print the elapsed time
                           (as reported by timeit) on stderr
"""
import sys
import traceback


def run():
    """Extract the references of sys.argv[1] with pdfx and print them.

    Each reference is written to stdout as "<key>\\t<reference>", where the
    keys are those of the dict returned by pdfx's get_references_as_dict().
    On any extraction error the traceback and a "Failed: <file>" line go to
    stderr and the process exits with status 1.
    """
    # Kept as a module global, as in the original.
    # NOTE(review): presumably for inspection after a run -- confirm.
    global pdf
    # Imported here so that argument errors are reported without needing
    # pdfx; a missing pdfx still surfaces as an uncaught ImportError.
    import pdfx

    try:
        pdf = pdfx.PDFx(sys.argv[1])
        links = pdf.get_references_as_dict()
    except Exception:
        # Exception rather than a bare except, so that KeyboardInterrupt
        # and SystemExit are not reported as a bad PDF.
        traceback.print_exc()
        print("\nFailed: %s" % sys.argv[1], file=sys.stderr)
        sys.exit(1)
    for key, refs in links.items():
        for ref in refs:
            print("%s\t%s" % (key, ref))


def main():
    """Dispatch on the command line: a normal run, or a timed run with -t N."""
    args = sys.argv
    if len(args) < 2 or (args[1] == '-t' and len(args) < 4):
        print("Usage: %s [-t N] FILE" % args[0], file=sys.stderr)
        sys.exit(2)
    if args[1] == '-t':
        import timeit
        # Remove "-t" and N so that run() finds FILE in sys.argv[1].
        sys.argv.pop(1)
        n = sys.argv.pop(1)
        print(timeit.timeit("run()", number=int(n),
                            setup="from __main__ import run"), file=sys.stderr)
    else:
        run()


if __name__ == "__main__":
    main()
#!/usr/bin/bash
# Origin: b/bin/plinks.sh, added Tue Feb 18 13:15:05 2020 +0000.
#
# Usage: plinks.sh [CRAWL]
#
# Changes to /beegfs/common_crawl/CC-MAIN-CRAWL/pdfs and feeds the list
# lm/<hostname>_pdfFilesWithLM through GNU parallel to doPlinks.sh, at
# most 71 jobs at a time.  A date/hostname line is printed before and
# after.
#
# Arguments: $1 - crawl id; defaults to 2019-35, the crawl that was
#                 previously hard-coded in the path
# Environment: TMPDIR - created if set and missing

crawl=${1:-2019-35}
host=$(hostname)
# Everything after the last "n" in the host name; used to tag job ids.
node=${host##*n}

# Skip when TMPDIR is unset: mkdir -p with no operand is an error.
if [[ -n "${TMPDIR:-}" ]]; then
  mkdir -p -- "$TMPDIR"
fi

echo "$(date) $host"

# Stop here if the data directory is missing rather than running the
# relative lm/... path against whatever directory we happen to be in.
cd "/beegfs/common_crawl/CC-MAIN-${crawl}/pdfs" || {
  printf 'plinks.sh: cannot cd to /beegfs/common_crawl/CC-MAIN-%s/pdfs\n' "$crawl" >&2
  exit 1
}

# {#} is replaced by parallel with the job sequence number, so each
# doPlinks.sh instance gets a distinct id of the form <node>_<seq>.
parallel --will-cite -j 71 --pipepart -a "lm/${host}_pdfFilesWithLM" \
  doPlinks.sh "${node}_{#}"

echo "$(date) $host"
#!/bin/bash
#PBS -l select=2:ncpus=36
#PBS -l place=exclhost
#PBS -l walltime=01:00:00
#PBS -V
#PBS -A dc007
#PBS -N listHdrs
#
# Origin: b/listHdrsJob.sh, added Tue Feb 18 13:15:05 2020 +0000.
#
# PBS batch job "listHdrs": launches bin/listHdrs.sh 2019-35 through
# mpiexec_mpt (-ppn 1 -n 1).  The #PBS directives stay directly below the
# shebang, ahead of any command.

module load mpt

# Work from PBS_O_WORKDIR (per the PBS documentation, the directory the
# job was submitted from).  Quoted so that paths with spaces work; falls
# back to $HOME when the variable is unset, which is where the original
# unquoted `cd` ended up.  Abort if the directory cannot be entered, since
# the script path below is relative.
cd "${PBS_O_WORKDIR:-$HOME}" || exit 1

export MPI_LAUNCH_TIMEOUT=120
export MPI_SHEPHERD=true

mpiexec_mpt -ppn 1 -n 1 bin/listHdrs.sh 2019-35
# No background jobs are started in this script, so this returns at once.
wait
#!/bin/bash
#PBS -l select=2:ncpus=36
#PBS -l place=exclhost
#PBS -l walltime=08:00:00
#PBS -V
#PBS -A dc007
#PBS -N plinks
#
# Origin: b/plinksJob.sh, added Tue Feb 18 13:15:05 2020 +0000.
#
# PBS batch job "plinks": launches bin/plinks.sh 2019-35 through
# mpiexec_mpt (-ppn 1 -n 2).  The #PBS directives stay directly below the
# shebang, ahead of any command.

module load mpt

# Work from PBS_O_WORKDIR (per the PBS documentation, the directory the
# job was submitted from).  Quoted so that paths with spaces work; falls
# back to $HOME when the variable is unset, which is where the original
# unquoted `cd` ended up.  Abort if the directory cannot be entered, since
# the script path below is relative.
cd "${PBS_O_WORKDIR:-$HOME}" || exit 1

export MPI_SHEPHERD=true

mpiexec_mpt -ppn 1 -n 2 bin/plinks.sh 2019-35