changeset 0:65a56c0d1c1f

bolting the barn door...
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 18 Feb 2020 13:15:05 +0000
parents
children a4b0359456bc
files .bashrc .emacs bin/doPlinks.sh bin/plinks.py bin/plinks.sh listHdrsJob.sh plinksJob.sh
diffstat 7 files changed, 149 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.bashrc	Tue Feb 18 13:15:05 2020 +0000
@@ -0,0 +1,51 @@
+# .bashrc
+
+# Source global definitions
+if [ -f /etc/bashrc ]; then
+	. /etc/bashrc
+fi
+
+# Uncomment the following line if you don't like systemctl's auto-paging feature:
+# export SYSTEMD_PAGER=
+
+# User specific aliases and functions
+module add gnu-parallel  # presumably puts GNU parallel on PATH via environment modules — confirm
+export N1=rli4n35  # NOTE(review): N1/N2 look like compute-node host names — confirm
+export N2=rli6n35
+export NN="$N1 $N2"  # both names as one space-separated string
+
+# Tools
+function lss() {  # long listing of the given paths, ordered numerically by field 5 (the size column of ls -l), largest first
+  /bin/ls -l "$@" | sort -nr -k 5,5
+  }
+
+function sus() {  # sort the input (files in "$@" or stdin), count identical lines, list the counts highest first
+  sort "$@" | uniq -c | sort -k1nr,1
+}
+
+function tot ()  # add up the first field of every stdin line and print the total as an unsigned integer
+{
+    awk '{sum+=$1} END {printf "%u\n",sum}'
+}
+
+btot ()  # add up one integer per stdin line with Python ints (arbitrary precision) and print the total
+{
+    python3 -c 'import sys
+n=0
+for l in sys.stdin:
+ n+=int(l)
+print(n)
+'
+}
+
+function typecat ()  # cat the file that command name $1 resolves to; for anything else just show what `type` reports
+{
+  local tt; tt=$(type "$1")  # local: do not leak tt into the interactive shell
+  case $tt in
+   *\ is\ hashed\ *) tt=${tt#*' is hashed ('}; cat "${tt%)}" ;;  # "NAME is hashed (PATH)" -> PATH
+   *\ is\ /*) cat "${tt#* is }" ;;  # "NAME is /PATH" -> /PATH, kept whole even if it contains spaces
+   *) type "$1" ;;
+  esac
+}
+export HISTSIZE=3000
+export PYTHONPATH=$HOME/lib/python3.6/site-packages
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.emacs	Tue Feb 18 13:15:05 2020 +0000
@@ -0,0 +1,18 @@
+;; .emacs
+
+(custom-set-variables
+ ;; custom-set-variables was added by Custom.
+ ;; If you edit it by hand, you could mess it up, so be careful.
+ ;; Your init file should contain only one such instance.
+ ;; If there is more than one, they won't work right.
+ '(diff-switches "-u")
+ '(text-mode-hook (quote (turn-on-auto-fill text-mode-hook-identify))))
+
+;;; uncomment for CJK utf-8 support for non-Asian users
+;; (require 'un-define)
+(custom-set-faces
+ ;; custom-set-faces was added by Custom.
+ ;; If you edit it by hand, you could mess it up, so be careful.
+ ;; Your init file should contain only one such instance.
+ ;; If there is more than one, they won't work right.
+ '(default ((t (:family "DejaVu Sans Mono" :foundry "unknown" :slant normal :weight normal :height 98 :width normal)))))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/doPlinks.sh	Tue Feb 18 13:15:05 2020 +0000
@@ -0,0 +1,15 @@
+#!/usr/bin/bash
+me=$1  # tag from the caller; part of every output file name written below
+mine=0  # index of the next links file, i.e. number of PDFs processed successfully so far
+mkdir -p /dev/shm/x  # -p: plinks.sh starts many instances in parallel, so the directory may already exist
+while IFS= read -r f  # one file name per stdin line, taken verbatim (no backslash or whitespace mangling)
+do
+    if plinks.py "$f" > "/dev/shm/x/links_${me}_${mine}" 2>/dev/null
+    then
+	((mine+=1))
+    else
+	printf '%s\n' "$f" >> "/dev/shm/x/badpdfs_$me"
+	rm -f "/dev/shm/x/links_${me}_${mine}"
+    fi
+done
+rsync -a /dev/shm/x/ links
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/plinks.py	Tue Feb 18 13:15:05 2020 +0000
@@ -0,0 +1,24 @@
+#!/lustre/sw/miniconda3/bin/python3
+import sys,pdfx,traceback
+def run():  # print one "key<TAB>item" line for every entry in pdfx's references dict for the PDF named by sys.argv[1]
+  global pdf
+  try:
+    pdf=pdfx.PDFx(sys.argv[1])
+    links=pdf.get_references_as_dict()
+  except Exception:  # not a bare except: KeyboardInterrupt and SystemExit must propagate
+    traceback.print_exc()
+    print("\nFailed: %s"%sys.argv[1],file=sys.stderr)
+    sys.exit(1)  # sys.exit rather than exit(), which only exists when the site module is loaded
+  for k,refs in links.items():
+    for l in refs:
+         print("%s\t%s"%(k,l))
+
+if sys.argv[1]=='-t':  # "-t N FILE": call run() N times and report the total elapsed seconds on stderr
+  import timeit
+  sys.argv.pop(1)  # drop "-t"
+  n=sys.argv[1]
+  sys.argv.pop(1)  # drop N, so that sys.argv[1] is the file name run() reads
+  print(timeit.timeit("run()",number=int(n),
+                      setup="from __main__ import run"),file=sys.stderr)
+else:
+  run()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/plinks.sh	Tue Feb 18 13:15:05 2020 +0000
@@ -0,0 +1,8 @@
+#!/usr/bin/bash
+mkdir -p "$TMPDIR"
+echo $(date) $(hostname)
+cd /beegfs/common_crawl/CC-MAIN-2019-35/pdfs || exit 1  # the relative lm/ input and doPlinks.sh's "links" output both depend on this directory
+h=$(hostname)
+hn=${h##*n}  # host name with everything up to and including its last "n" removed
+parallel --will-cite -j 71 --pipepart -a "lm/${h}_pdfFilesWithLM" doPlinks.sh "${hn}_{#}"  # {#} is parallel's job sequence number, so each doPlinks.sh gets a distinct tag
+echo $(date) $(hostname)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/listHdrsJob.sh	Tue Feb 18 13:15:05 2020 +0000
@@ -0,0 +1,17 @@
+#!/bin/bash
+#PBS -l select=2:ncpus=36
+#PBS -l place=exclhost
+#PBS -l walltime=01:00:00
+#PBS -V
+#PBS -A dc007
+#PBS -N listHdrs
+
+module load mpt
+
+cd "${PBS_O_WORKDIR}" || exit 1  # stop rather than try to launch the relative bin/ path from the wrong directory
+
+export MPI_LAUNCH_TIMEOUT=120
+export MPI_SHEPHERD=true
+
+mpiexec_mpt -ppn 1 -n 1 bin/listHdrs.sh 2019-35  # NOTE(review): select=2 is requested above but only one process (-n 1) is started — confirm
+wait  # nothing is started in the background in this script, so this returns immediately
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/plinksJob.sh	Tue Feb 18 13:15:05 2020 +0000
@@ -0,0 +1,16 @@
+#!/bin/bash
+#PBS -l select=2:ncpus=36
+#PBS -l place=exclhost
+#PBS -l walltime=08:00:00
+#PBS -V
+#PBS -A dc007
+#PBS -N plinks
+
+module load mpt
+
+cd "${PBS_O_WORKDIR}" || exit 1  # stop rather than try to launch the relative bin/ path from the wrong directory
+
+export MPI_SHEPHERD=true
+
+mpiexec_mpt -ppn 1 -n 2 bin/plinks.sh 2019-35  # one process per node (-ppn 1), two in total; plinks.sh as shown never reads this argument
+