art_sam_wrap.sh
#!/bin/sh

#
# This job assumes it's being passed:
# $SAM_PROJECT_NAME
# $EXPERIMENT
# $GRID_USER

if [ "x$ART_SAM_DEBUG" = "xtrue" ]
then
    set -x
fi

hostname
uname -a
ls /lib*/libc-*.so

# defaults

dest=""
conf=/dev/null
cmd=$EXPERIMENT
quals=nu:e4:debug
vers=v1_2_10
renam=""
prename=false
limit=""
getconfig=false
use_gdb=false
multifile=false
exports=""
sources=""
prescripts=""
postscripts=""
self_destruct_timeout=""
input_files=""
hash=""
hashdir=""
export conf

datadir=$TMPDIR/ifdh_$$

#set -x
#
# parse options we know, collect rest in $args
#
usage() {
cat <<EOF
Usage:
    $0 [Options] [cmd_options]

    find ifdh_art and dependencies in CVMFS or in /nusoft/app/externals,
    register a consumer process, and run an ART executable,
    fetching input from a SAM Project specified by $SAM_PROJECT_NAME
    in the environment.

    Options are:

    -h|--help
        Print this message and exit

    -q|--quals str
    -v|--vers version
        Set qualifiers and version of ifdh_art to setup if
        it isn't setup by any --source parameters.

    -X|--exe executable
    -c|--config file
        executable (defaults to the experiment name) and config file to
        run as: executable -c config [cmd_options]

    -D|--dest url
        specify destination path or url for copying back output
        default is to not copy back files

    -H|--hash
        copy output to a hashed directory structure. This is used
        for production copy backs to prevent too many files in one fts
        directory.

    -R|--rename how
    --rename2 how
    --rename3 how
        call "ifdh renameOutput" to rename output files
        ...possibly two or three times

    -g|--getconfig
        get the config file as an input file from SAM
        (i.e. for MonteCarlo simulation)
        conflicts with --config.

    --multifile
        Fetches multiple files per job and runs the executable
        once per file.
        conflicts with --getconfig

    --confbase file
        for --getconfig, prepend this file to the fetched
        config file before running the executable

    -G|--with-gdb
        Run the executable under the debugger, and print a
        stack trace if it dies

    -L|--limit n
        Pass a number-of-files limit to establishProcess.

    --inputfile file
        Copy this extra input file into the job area before
        running the executable

    --addoutput pattern
        call "ifdh addOutputFile" with files that match this
        glob pattern (e.g. --addoutput '*out.root')

    --export VAR=value
        export the given VAR=value before running the
        executable

    --self-destruct-timer seconds
        kill the job if the executable runs for more than the given
        number of seconds; usually only use this if you have jobs
        that hang and you get no output back

    --source file:arg:arg:...
    --prescript file:arg:arg:...
    --postscript file:arg:arg:...
        source/execute the file before/after the main executable
        is run.
EOF
}

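# Illustrative invocation (a sketch only -- the project, experiment, config,
# and destination values below are hypothetical, and in practice jobsub
# supplies the environment variables):
#
#   SAM_PROJECT_NAME=myproject EXPERIMENT=nova GRID_USER=alice \
#       ./art_sam_wrap.sh --multifile -X nova -c reco.fcl \
#       -D gsiftp://some.host/some/path --addoutput '*out.root'
#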
while [ $# -gt 0 ]
do
    echo "debug: \$1 '$1' \$2 '$2'"
    case "x$1" in
    x-h|x--help)      usage; exit 0;;
    x-q|x--quals)     quals="$2"; shift; shift; continue;;
    x-c|x--config)    conf="$2"; shift; shift; continue;;
    x-D|x--dest)      dest="$2"; shift; shift; continue;;
    x-H|x--hash)      hash=true; shift; continue;;
    x--prename)       prename=true; shift; continue;;
    x-R|x--rename)    renam="$2"; shift; shift; continue;;
    x--rename2)       renam2="$2"; shift; shift; continue;;
    x--rename3)       renam3="$2"; shift; shift; continue;;
    x-X|x--exe)       cmd="$2"; shift; shift; continue;;
    x-v|x--vers)      vers="$2"; shift; shift; continue;;
    x-g|x--getconfig) getconfig=true; shift; continue;;
    x--multifile)     multifile=true; shift; continue;;
    x-G|x--with-gdb)  use_gdb=true; shift; continue;;
    x-L|x--limit)     limit="$2"; shift; shift; continue;;
    x--inputfile)     input_files="$input_files $2"; shift; shift; continue;;
    x--addoutput)     addoutput="$2"; shift; shift; continue;;
    x--confbase)      confbase="$2"; shift; shift; continue;;
    x--export)        exports="$exports \"$2\""; shift; shift; continue;;
    x--source)        sources="$sources \"$2\""; shift; shift; continue;;
    x--self-destruct-timer) self_destruct_timeout=$2; shift; shift; continue;;
    x--prescript)     prescripts="$prescripts \"$2\""; shift; shift; continue;;
    x--postscript)    postscripts="$postscripts \"$2\""; shift; shift; continue;;
    *) args="$args \"$1\""; shift; continue;;
    esac
    break
done

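# Note on the loop above: each option is matched with an "x" prefix ("x$1")
# so that arguments beginning with "-" cannot be mistaken for case/test
# operators; anything not recognized falls through to the "*)" branch and is
# passed along to the ART executable via $args.
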
find_ups() {

    #
    # use our slf6 stuff for systems with 3.x kernels (i.e. MTW2)
    #
    case `uname -r` in
    3.*) export UPS_OVERRIDE="-H Linux64bit+2.6-2.12";;
    4.*) export UPS_OVERRIDE="-H Linux64bit+2.6-2.12";;
    esac

    for path in /cvmfs/${EXPERIMENT}.opensciencegrid.org/products /cvmfs/${EXPERIMENT}.opensciencegrid.org/externals /cvmfs/oasis.opensciencegrid.org/${EXPERIMENT}/externals /cvmfs/${EXPERIMENT}cfs.fnal.gov/externals /nusoft/app/externals /grid/fermiapp/products/${EXPERIMENT}
    do
        if [ -r $path/setup ]
        then
            source $path/setup
            return 0
        fi
    done
    return 1
}

check_space() {
    set : `df -P . | tail -1`
    avail_blocks=$5
    if [ $avail_blocks -lt 1024 ]
    then
        echo "Not enough space (only ${avail_blocks}k) on this node in `pwd`."
        df -H .
        return 1
    fi
    return 0
}

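# check_space parses the last line of "df -P ."; the leading ":" in
# "set : `...`" guards against fields that begin with "-", so the available
# 1k-block count lands in $5 and the test above requires roughly 1 MB free.
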
kill_proc_kids_after_n() {
    watchpid=$1
    after_secs=$2
    rate=10
    sofar=0

    start=`date +%s`
    echo "Starting self-destruct timer of $after_secs at $start"

    while kill -0 $watchpid 2> /dev/null && [ $sofar -lt $after_secs ]
    do
        sleep $rate
        now=`date +%s`
        sofar=$((now - start))
        printf "."
    done
    printf "\n"

    if kill -0 $watchpid
    then
        pslist=`ps -ef | grep " $watchpid " | grep -v grep`
        printf "Timed out after $sofar seconds...\n"
        for signal in 15 9
        do
            echo "$pslist" |
            while read uid pid ppid rest
            do
                if [ $ppid = $watchpid ]
                then
                    echo "killing -$signal $uid $pid $ppid $rest"
                    kill -$signal $pid
                fi
            done
            echo "killing -$signal $watchpid"
            kill -$signal $watchpid
        done
    fi
}

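# The watchdog above polls with "kill -0" (signal 0 only checks that the
# process still exists) every $rate seconds; once $after_secs is exceeded it
# sends SIGTERM and then SIGKILL, first to the children of $watchpid found in
# the ps listing and then to $watchpid itself.
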
if [ x"$self_destruct_timeout" != x ]
then
    kill_proc_kids_after_n $$ $self_destruct_timeout &
fi

#
# if we don't have ups or enough space, try again for a bit
# before giving up
#

hostname
count=0
until find_ups && check_space
do
    count=$((count + 1))
    if [ $count -gt 6 ]
    then
        echo "Timed out waiting for space and/or cvmfs ups area"
        exit 1
    fi
    sleep 600
done

# not sure we need this
if [ "x$IFDH_BASE_URI" = "x" ]
then
    export IFDH_BASE_URI=http://samweb.fnal.gov:8480/sam/$EXPERIMENT/api
fi

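# (IFDH_BASE_URI points ifdh at the experiment's SAMWeb API endpoint; if the
#  submission environment already exports it, the default above is not used.)
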
#
# treat colons as blanks when eval-ing sources below
# (need a better char? what about PATH changes?)
# because blanks get split by jobsub no matter what you do...
#
for blat in $exports
do
    echo "doing: export $blat"
    eval export $blat
done

for blat in $sources
do
    base=`echo $blat | sed -e 's/:.*//'`
    blat=`echo $blat | sed -e 's/:/ /g'`
    [ -x $base ] || chmod +x $base
    eval blat=$blat
    echo "doing: source $blat"
    eval "source $blat"
done

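# For example (the file and qualifier names here are hypothetical):
# "--export IFDH_DEBUG=1" simply becomes "export IFDH_DEBUG=1" above, while
# a "--source setup.sh:-q:prof" argument has its colons rewritten to blanks
# and is run as "source setup.sh -q prof".
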
for blat in $prescripts
do
    base=`echo $blat | sed -e 's/:.*//'`
    blat=`echo $blat | sed -e 's/:/ /g'`
    [ -x $base ] || chmod +x $base
    eval blat=$blat
    echo "doing: $blat"
    eval "$blat"
done

eval "confbase=$confbase"

#
# make sure we have ifdh_art
#
if [ x$IFDH_ART_DIR = x ]
then
    . `ups setup ifdh_art $vers -q $quals:`
fi

# should not need this, but seem to for older releases -- SL5 setup on SL6 bug
#PATH=/bin:/usr/bin:`echo $IFDHC_DIR/Linux*/bin`:$PATH
PATH=`echo $IFDHC_DIR/Linux*/bin`:$PATH:/bin:/usr/bin
LD_LIBRARY_PATH=`echo $IFDHC_DIR/Linux*/lib`:`echo $IFDH_ART_DIR/Linux*/lib`:$LD_LIBRARY_PATH

if [ -n "${JOBSUBJOBID}" ]
then
    description="${JOBSUBJOBID}"
elif [ -n "${CLUSTER}" ]
then
    description="${CLUSTER}.${PROCESS}"
else
    description=""
fi

appname=$(basename $cmd)

hostname=`hostname --fqdn`
projurl=`ifdh findProject $SAM_PROJECT_NAME ${SAM_STATION:-$EXPERIMENT}`
consumer_id=''

consumer_id=`IFDH_DEBUG= ifdh establishProcess "$projurl" "$appname" "$ART_VERSION" "$hostname" "$GRID_USER" "art" "$description" "$limit"`
if [ "$consumer_id" = '' ]
then
    echo "Unable to establish consumer id!"
    echo "Unable to establish consumer id!" >&2
    exit 1
fi

echo project url: $projurl
echo consumer id: $consumer_id

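# From here on the job is a registered SAM consumer process: input files are
# requested with "ifdh getNextFile $projurl $consumer_id", localized with
# "ifdh fetchInput", and marked transferred/consumed/skipped with
# "ifdh updateFileStatus"; the process is closed out near the end of the
# script with "ifdh setStatus" and "ifdh endProcess".
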
#
# override flags for grid copies...
# this should be in ifdh_cp, but until it is...
#
export IFDH_GRIDFTP_EXTRA="-p 0 -dp"

#
# Joe says not to do this...
#
#cd $TMPDIR
check_space

echo "Active ups products:"
ups active

if [ -n "$input_files" ]
then
    ifdh cp -D $input_files .
fi

if [ ! -z "${TARFILE}" ] ; then
    if [ ! -f "${TARFILE}" ] ; then
        echo "ERROR Tar file ${TARFILE} doesn't exist"
        exit 1
    fi
    tar xzf $TARFILE
    srt_setup -a
fi

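# At this point any --inputfile files (copied in with "ifdh cp -D") and the
# optional user tarball ${TARFILE} (unpacked and activated with "srt_setup -a")
# are available in the job's working directory.
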
if $getconfig
then

    echo "Getconfig case:"

    res=0
    while [ "$res" = 0 ]
    do
        uri=`IFDH_DEBUG= ifdh getNextFile $projurl $consumer_id | tail -1`
        if [ x"$uri" = x ]
        then
            break
        fi

        fname=`IFDH_DEBUG= ifdh fetchInput "$uri" | tail -1`
        if [ $? != 0 ]
        then
            echo "Error: unable to fetch input file $uri" >&2
            continue
        fi

        if [ x$confbase != x ]
        then
            cat $confbase $fname > $fname.new
            mv $fname.new $fname
        fi

        echo "config is now:"
        echo "=============="
        cat $fname
        echo "=============="
        conf=$fname
        datestamp=`date +%F-%H-%M-%S`
        echo conf is $conf
        ifdh updateFileStatus $projurl $consumer_id $fname transferred

        #out=`basename $fname | sed -e "s/.fcl$/$datestamp.root/"`
        #command="\"${cmd}\" -c \"$conf\" $args -o $out --process-name test"
        command="\"${cmd}\" -c \"$conf\" $args "

        echo "Running: $command"

        if eval "$command"
        then
            ifdh updateFileStatus $projurl $consumer_id $fname consumed
        else
            res=$?
            ifdh updateFileStatus $projurl $consumer_id $fname skipped
        fi
        #uri=`ifdh getNextFile $projurl $consumer_id`
    done
elif $multifile
then
    echo "Multi-file case:"

    echo ""
    echo "--------------------------------------------------------"
    echo "PATH:"
    echo $PATH
    echo ""
    echo "Python environment variables:"
    env | grep PYTHON
    echo ""
    echo "Path to python executable"
    which python
    echo "--------------------------------------------------------"
    echo ""

    res=0
    while :
    do
        uri=`IFDH_DEBUG= ifdh getNextFile $projurl $consumer_id | tail -1`
        if [ x"$uri" = x ]
        then
            break
        fi
        fname=`IFDH_DEBUG= ifdh fetchInput "$uri" | tail -1`
        echo "got file: $fname"
        ifdh updateFileStatus $projurl $consumer_id $fname transferred

        command="\"${cmd}\" -c \"$conf\" $args $fname"

        echo "Running: $command"
        if eval "$command"
        then
            ifdh updateFileStatus $projurl $consumer_id $fname consumed
        else
            res=$?
            ifdh updateFileStatus $projurl $consumer_id $fname skipped
        fi
    done
else
    echo "Not Getconfig case:"

    update_via_fcl=false

    : $update_via_fcl

    if $update_via_fcl
    then

        cp $conf ${TMPDIR:=/var/tmp}/conf.$$
        conf=$TMPDIR/conf.$$
        cat >> $conf <<EOF
services.user.IFDH: {}
services.user.IFDH.debug: "1"
services.user.CatalogInterface.service_provider: "IFCatalogInterface"
services.user.CatalogInterface.webURI: "$projurl"
services.user.FileTransfer.service_provider: "IFFileTransfer"
source.fileNames: [ "$consumer_id" ]
EOF

    else
        args="$args \"--sam-web-uri=$projurl\" \"--sam-process-id=$consumer_id\""
    fi

    #
    # debugging
    #
    # ups active
    # printenv

    command="\"${cmd}\" -c \"$conf\" $args"

    if $use_gdb
    then
        printf 'run\nwhere\nquit\n' > gdbcmds
        command="gdb -x gdbcmds --args $command"
    fi

    echo "Running: $command"
    eval "$command"
    res=$?

    if $update_via_fcl
    then
        rm ${conf}
    fi

fi

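# Recap of the three run modes above (a rough summary, not extra behavior):
#   --getconfig : each file fetched from SAM *is* the fcl configuration
#                 (optionally prefixed with --confbase); the executable is
#                 run once per fetched config
#   --multifile : the fcl is fixed; each fetched file is appended to the
#                 command line and the executable is run once per file
#   default     : the executable is run once and pulls its own input, either
#                 via the --sam-web-uri / --sam-process-id arguments or via
#                 the fcl overrides when update_via_fcl is set to true
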
for blat in $postscripts
do
    base=`echo $blat | sed -e 's/:.*//'`
    blat=`echo $blat | sed -e 's/:/ /g'`
    [ -x $base ] || chmod +x $base
    eval blat=$blat
    echo "doing: $blat"
    eval "$blat"
done

if [ "$prename" = "true" -a "$res" = "0" ]
then
    for f in $addoutput
    do
        newname=`$SRT_PUBLIC_CONTEXT/Metadata/samUtils/get_new_file_name $f`
        fpath=`dirname $f`
        mv $f $fpath/$newname
    done
fi

if [ "$hash" = "true" -a "$res" = "0" ]
then
    for f in $addoutput
    do
        hashdir=`python -c 'import hashlib, sys;print "/".join(hashlib.md5(sys.argv[1]).hexdigest()[:3])' $f`
    done
fi

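# e.g. an output file whose md5 hex digest happens to begin with "3fa" gets
# hashdir "3/f/a", so the copy back below would land under "$dest/3/f/a/".
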
if [ "x$addoutput" != "x" -a "$res" = "0" ]
then
    for f in $addoutput
    do
        ifdh addOutputFile $f
    done
fi

if [ "x$renam" != "x" -a "$res" = "0" ]
then
    ifdh renameOutput $renam
fi

if [ "x$renam2" != "x" -a "$res" = "0" ]
then
    ifdh renameOutput $renam2
fi

if [ "x$renam3" != "x" -a "$res" = "0" ]
then
    ifdh renameOutput $renam3
fi

case `hostname` in
*.smu.edu) export IFDH_STAGE_VIA='srm://smuosgse.hpc.smu.edu:8443/srm/v2/server?SFN=/data/srm'
    echo "turning on staging for SMU..."
    ;;
esac

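# (IFDH_STAGE_VIA asks ifdh to stage the copy back through the given SRM
#  location instead of writing directly from the worker node; the *.smu.edu
#  nodes are the one site handled that way here.)
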
if [ "x$dest" != "x" -a "$res" = "0" ]
then
    # workaround for srmls hangs
    export SRM_JAVA_OPTIONS=-Xmx2048m

    voms-proxy-info -all

    ifdh copyBackOutput "$dest/$hashdir/"
fi


if [ "$res" = 0 ]
then
    ifdh setStatus "$projurl" "$consumer_id" completed
else
    ifdh setStatus "$projurl" "$consumer_id" bad
fi

ifdh endProcess "$projurl" "$consumer_id"

ifdh cleanup -x

# cleanup temporary script dir
rm -rf $dp

# clean up usual detritus
rm -f *.fcl *.raw *.root t_* stage_*

exit $res