# HG changeset patch
# User Henry S. Thompson
# Date 1582031705 0
# Node ID 65a56c0d1c1fdb8d54407df4a760295d82db9a60
bolting the barn door...

diff -r 000000000000 -r 65a56c0d1c1f .bashrc
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/.bashrc Tue Feb 18 13:15:05 2020 +0000
@@ -0,0 +1,57 @@
+# .bashrc
+
+# Source global definitions
+if [ -f /etc/bashrc ]; then
+    . /etc/bashrc
+fi
+
+# Uncomment the following line if you don't like systemctl's auto-paging feature:
+# export SYSTEMD_PAGER=
+
+# User specific aliases and functions
+module add gnu-parallel
+export N1=rli4n35
+export N2=rli6n35
+export NN="$N1 $N2"
+
+# Tools
+# lss: long listing ordered by the 5th column of ls -l, largest first
+function lss() {
+    /bin/ls -l "$@" | sort -nr -k 5,5
+}
+
+# sus: count duplicate lines and list them most frequent first
+function sus() {
+    sort "$@" | uniq -c | sort -k1nr,1
+}
+
+# tot: sum the first field of every stdin line
+function tot ()
+{
+    awk '{sum+=$1} END {printf "%u\n",sum}'
+}
+
+# btot: like tot for one integer per line, summed by Python
+btot ()
+{
+    python3 -c 'import sys
+n=0
+for l in sys.stdin:
+    n+=int(l)
+print(n)
+'
+}
+
+# typecat: cat the file a command name resolves to according to type;
+# for anything else just print what type reports
+function typecat ()
+{
+    tt=$(type $1)
+    case $tt in
+        *\ is\ hashed\ *) cat $(echo $tt | cut -f 4 -d ' ' | tr -d '()') ;;
+        *\ is\ /*) cat $(echo $tt | cut -f 3 -d ' ' | tr -d '()') ;;
+        *) type $1 ;;
+    esac
+}
+export HISTSIZE=3000
+export PYTHONPATH=$HOME/lib/python3.6/site-packages
diff -r 000000000000 -r 65a56c0d1c1f .emacs
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/.emacs Tue Feb 18 13:15:05 2020 +0000
@@ -0,0 +1,18 @@
+;; .emacs
+
+(custom-set-variables
+ ;; custom-set-variables was added by Custom.
+ ;; If you edit it by hand, you could mess it up, so be careful.
+ ;; Your init file should contain only one such instance.
+ ;; If there is more than one, they won't work right.
+ '(diff-switches "-u")
+ '(text-mode-hook (quote (turn-on-auto-fill text-mode-hook-identify))))
+
+;;; uncomment for CJK utf-8 support for non-Asian users
+;; (require 'un-define)
+(custom-set-faces
+ ;; custom-set-faces was added by Custom.
+ ;; If you edit it by hand, you could mess it up, so be careful.
+ ;; Your init file should contain only one such instance.
+ ;; If there is more than one, they won't work right.
+ '(default ((t (:family "DejaVu Sans Mono" :foundry "unknown" :slant normal :weight normal :height 98 :width normal)))))
diff -r 000000000000 -r 65a56c0d1c1f bin/doPlinks.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/doPlinks.sh Tue Feb 18 13:15:05 2020 +0000
@@ -0,0 +1,23 @@
+#!/usr/bin/bash
+# Usage: doPlinks.sh ID   (reads one plinks.py argument line per stdin line)
+# Runs plinks.py on each input line, writing its output to
+# /dev/shm/x/links_ID_N (N counts successes); lines plinks.py fails on
+# are appended to /dev/shm/x/badpdfs_ID.  Finally /dev/shm/x is rsync'ed
+# into ./links.
+me=$1
+mine=0
+# -p: plinks.sh starts many copies in parallel, all sharing this directory
+mkdir -p /dev/shm/x
+# NOTE(review): $f is deliberately left unquoted below in case input lines
+# carry extra fields after the file name -- confirm before quoting
+while read f
+do
+    if plinks.py $f > /dev/shm/x/links_${me}_${mine} 2>/dev/null
+    then
+        ((mine+=1))
+    else
+        echo $f >> /dev/shm/x/badpdfs_$me
+        rm -f /dev/shm/x/links_${me}_${mine}
+    fi
+done
+rsync -a /dev/shm/x/ links
diff -r 000000000000 -r 65a56c0d1c1f bin/plinks.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/plinks.py Tue Feb 18 13:15:05 2020 +0000
@@ -0,0 +1,41 @@
+#!/lustre/sw/miniconda3/bin/python3
+"""Print the references pdfx finds in a PDF as "key<TAB>value" lines.
+
+Usage: plinks.py [-t N] FILE
+With -t N, run the extraction N times under timeit and report the
+elapsed seconds on stderr.
+"""
+import sys,pdfx,traceback
+def run():
+    """Print one "key<TAB>value" line per reference in sys.argv[1].
+
+    On failure, print the traceback and a "Failed:" line to stderr
+    and exit with status 1.
+    """
+    # NOTE(review): pdf is global, presumably for interactive inspection
+    global pdf
+    try:
+        pdf=pdfx.PDFx(sys.argv[1])
+        links=pdf.get_references_as_dict()
+    except Exception:
+        # Exception, not a bare except, so Ctrl-C/SystemExit propagate
+        traceback.print_exc()
+        print("\nFailed: %s"%sys.argv[1],file=sys.stderr)
+        sys.exit(1)
+    for k,refs in links.items():
+        for ref in refs:
+            print("%s\t%s"%(k,ref))
+
+if len(sys.argv)<2:
+    # Fail with a usage message instead of an IndexError traceback
+    print("usage: plinks.py [-t N] FILE",file=sys.stderr)
+    sys.exit(2)
+elif sys.argv[1]=='-t':
+    import timeit
+    sys.argv.pop(1)
+    n=sys.argv[1]
+    sys.argv.pop(1)
+    print(timeit.timeit("run()",number=int(n),
+                        setup="from __main__ import run"),file=sys.stderr)
+else:
+    run()
diff -r 000000000000 -r 65a56c0d1c1f bin/plinks.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/plinks.sh Tue Feb 18 13:15:05 2020 +0000
@@ -0,0 +1,13 @@
+#!/usr/bin/bash
+# Per-host driver: feed this host's list lm/<hostname>_pdfFilesWithLM in
+# chunks to doPlinks.sh, up to 71 jobs at a time; each job gets the ID
+# <hn>_<job number>.  Prints date and hostname before and after.
+mkdir -p $TMPDIR
+echo $(date) $(hostname)
+# Stop if the data directory is unavailable rather than run elsewhere
+cd /beegfs/common_crawl/CC-MAIN-2019-35/pdfs || exit 1
+h=$(hostname)
+# hn is whatever follows the last 'n' in the host name
+hn=${h##*n}
+parallel --will-cite -j 71 --pipepart -a lm/$(hostname)_pdfFilesWithLM doPlinks.sh ${hn}_'{#}'
+echo $(date) $(hostname)
diff -r 000000000000 -r 65a56c0d1c1f listHdrsJob.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/listHdrsJob.sh Tue Feb 18 13:15:05 2020 +0000
@@ -0,0 +1,17 @@
+#!/bin/bash
+#PBS -l select=2:ncpus=36
+#PBS -l place=exclhost
+#PBS -l walltime=01:00:00
+#PBS -V
+#PBS -A dc007
+#PBS -N listHdrs
+
+module load mpt
+
+cd ${PBS_O_WORKDIR}
+
+export MPI_LAUNCH_TIMEOUT=120
+export MPI_SHEPHERD=true
+
+mpiexec_mpt -ppn 1 -n 1 bin/listHdrs.sh 2019-35
+wait
diff -r 000000000000 -r 65a56c0d1c1f plinksJob.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/plinksJob.sh Tue Feb 18 13:15:05 2020 +0000
@@ -0,0 +1,16 @@
+#!/bin/bash
+#PBS -l select=2:ncpus=36
+#PBS -l place=exclhost
+#PBS -l walltime=08:00:00
+#PBS -V
+#PBS -A dc007
+#PBS -N plinks
+
+module load mpt
+
+cd ${PBS_O_WORKDIR}
+
+export MPI_SHEPHERD=true
+
+mpiexec_mpt -ppn 1 -n 2 bin/plinks.sh 2019-35
+