2 #------------------------------------------------------------------
4 # Purpose: A batch script to fetch root files from sam and combine
7 # Adapted from condor_lBdetMC.sh by E. Church.
11 # condor_hadd_sam.sh [options]
15 # -c, --config - For compatibility (ignored).
16 # -T, --TFileName <arg> - TFile output file name
17 # --nfile <arg> - Number of files to process per worker.
19 # Sam and parallel project options.
21 # --sam_user <arg> - Specify sam user (default $GRID_USER).
22 # --sam_group <arg> - Specify sam group (default --group option).
23 # --sam_station <arg> - Specify sam station (default --group option).
24 # --sam_defname <arg> - Sam dataset definition name.
25 # --sam_project <arg> - Sam project name.
26 # --sam_start - Specify that this worker should be responsible for
27 # starting and stopping the sam project.
28 # --recur - Recursive input dataset (force snapshot).
29 # --sam_schema <arg> - Use this option with argument "root" to stream files using
30 # --os <arg> - A copy of the os argument passed to jobsub. May be used
31 # to affect definition of UPS_OVERRIDE.
32 # --data_file_type - Specify data file type (default "root," repeatable).
36 # --ups <arg> - Comma-separated list of top level run-time ups products.
37 # -r, --release <arg> - Release tag.
38 # -q, -b, --build <arg> - Release build qualifier (default "debug", or "prof").
39 # --localdir <arg> - Larsoft local test release directory (default none).
40 # --localtar <arg> - Tarball of local test release.
41 # --mrb - Ignored (for compatibility).
42 # --srt - Exit with error status (SRT run time no longer supported).
46 # -h, --help - Print help.
47 # -i, --interactive - For interactive use.
48 # -g, --grid - Be grid-friendly.
49 # --group <arg> - Group or experiment (required).
50 # --workdir <arg> - Work directory (required).
51 # --outdir <arg> - Output directory (required).
52 # --logdir <arg> - Log directory (required).
53 # --scratch <arg> - Scratch directory (only for interactive).
54 # --cluster <arg> - Job cluster (override $CLUSTER)
55 # --process <arg> - Process within cluster (override $PROCESS).
56 # --procmap <arg> - Name of process map file (override $PROCESS).
57 # --init-script <arg> - User initialization script to execute.
58 # --init-source <arg> - User initialization script to source (bash).
59 # --end-script <arg> - User end-of-job script to execute.
60 # --init <path> - Absolute path of environment initialization script.
64 # Run time environment setup.
66 # MRB run-time environmental setup is controlled by four options:
67 # --release (-r), --build (-b, -q), --localdir, and --localtar.
69 # a) Use option --release or -r to specify version of top-level product(s).
70 # b) Use option --build or -b to specify build full qualifiers (e.g.
71 # "debug:e5" or "e5:prof").
72 # c) Options --localdir or --localtar are used to specify your local
73 # test release. Use one or the other (not both).
75 # Use --localdir to specify the location of your local install
76 # directory ($MRB_INSTALL).
78 # Use --localtar to specify the location of a tarball of your
79 # install directory (made relative to $MRB_INSTALL).
81 # Note that --localdir is not grid-friendly.
85 # 1. Each batch worker is uniquely identified by two numbers stored
86 # in environment variables $CLUSTER and $PROCESS (the latter is
87 # a small integer that starts from zero and varies for different
88 # jobs in a parallel job group). These environment variables are
89 # normally set by the batch system, but can be overridden by options
90 # --cluster, --process, and --procmap (e.g. to rerun failed jobs).
92 # 2. The work directory must be set to an existing directory owned
93 # by the submitter and readable by the batch worker. Files from the
94 # work directory are copied to the batch worker scratch directory at
95 # the start of the job.
97 # 3. The initialization and end-of-job
98 # scripts (options --init-script, --init-source, --end-script) may
99 # be stored in the work directory specified by option --workdir, or they
100 # may be specified as absolute paths visible on the worker node.
102 # 4. A local test release may be specified as an absolute path using
103 # --localdir, or a tarball using --localtar. The location of the tarball
104 # may be specified as an absolute path visible on the worker, or a
105 # relative path relative to the work directory.
107 # 5. The output directory must exist and be writable by the batch
108 # worker (i.e. be group-writable for grid jobs). The worker
109 # makes a new subdirectory called ${CLUSTER}_${PROCESS} in the output
110 # directory and copies all files in the batch scratch directory there
111 # at the end of the job. If the output directory is not specified, the
112 # default is /grid/data/<group>/outstage/<user> (user is defined as
113 # owner of work directory).
115 # 6. This script reads input files from sam using the standard sam project api.
116 # All files are fetched from sam, then they are combined by a single
117 # invocation of hadd. This way of working implies an upper limit on
118 # the number of files that can be combined in a single worker.
121 # Created: H. Greenlee, 29-Aug-2012
123 #------------------------------------------------------------------
158 declare -a DATAFILETYPES
160 while [ $# -gt 0 ]; do
165 awk '/^# Usage:/,/^# End options/{print $0}' $0 | cut -c3- | head -n -2
169 # Config file (for compatibility -- ignored).
171 if [ $# -gt 1 ]; then
176 # Number of events (for compabitility -- ignored).
178 if [ $# -gt 1 ]; then
185 if [ $# -gt 1 ]; then
191 # Number of files to process.
193 if [ $# -gt 1 ]; then
199 # Specify data file types (repeatable).
201 if [ $# -gt 1 ]; then
202 ntype=${#DATAFILETYPES[@]}
203 DATAFILETYPES[$ntype]=$2
210 if [ $# -gt 1 ]; then
218 if [ $# -gt 1 ]; then
226 if [ $# -gt 1 ]; then
232 # Sam dataset definition name.
234 if [ $# -gt 1 ]; then
242 if [ $# -gt 1 ]; then
248 # Sam start/stop project flag.
260 if [ $# -gt 1 ]; then
268 if [ $# -gt 1 ]; then
274 # General arguments for hadd command line.
276 if [ $# -gt 1 ]; then
283 # Top level ups products (comma-separated list).
285 if [ $# -gt 1 ]; then
293 if [ $# -gt 1 ]; then
299 # Release build qualifier.
301 if [ $# -gt 1 ]; then
307 # Local test release directory.
309 if [ $# -gt 1 ]; then
315 # Local test release tarball.
317 if [ $# -gt 1 ]; then
329 echo "SRT run time environment is no longer supported."
338 # Grid flag (no effect).
344 if [ $# -gt 1 ]; then
352 if [ $# -gt 1 ]; then
360 if [ $# -gt 1 ]; then
368 if [ $# -gt 1 ]; then
376 if [ $# -gt 1 ]; then
384 if [ $# -gt 1 ]; then
390 # Process within cluster.
392 if [ $# -gt 1 ]; then
400 if [ $# -gt 1 ]; then
406 # User initialization script.
408 if [ $# -gt 1 ]; then
414 # User source initialization script.
416 if [ $# -gt 1 ]; then
422 # User end-of-job script.
424 if [ $# -gt 1 ]; then
430 # Specify environment initialization script path.
432 if [ $# -gt 1 ]; then
440 echo "Unknown option $1"
451 #echo "LOCALDIR=$LOCALDIR"
452 #echo "LOCALTAR=$LOCALTAR"
453 #echo "INTERACTIVE=$INTERACTIVE"
455 #echo "WORKDIR=$WORKDIR"
456 #echo "OUTDIR=$OUTDIR"
457 #echo "LOGDIR=$LOGDIR"
458 #echo "SCRATCH=$SCRATCH"
461 #echo "INITSCRIPT=$INITSCRIPT"
462 #echo "INITSOURCE=$INITSOURCE"
463 #echo "ENDSCRIPT=$ENDSCRIPT"
465 # Set default data file types ("root").
467 if [ ${#DATAFILETYPES[@]} -eq 0 ]; then
468 DATAFILETYPES[0]=root
471 # Done with arguments.
473 echo "Nodename: `hostname -f`"
480 if [ x$QUAL = x ]; then
484 if [ x$SAM_GROUP = x ]; then
488 if [ x$SAM_STATION = x ]; then
492 # Standardize sam_schema (xrootd -> root, xroot -> root).
494 if [ x$SAM_SCHEMA = xxrootd ]; then
497 if [ x$SAM_SCHEMA = xxroot ]; then
501 # Make sure work directory is defined and exists.
503 if [ x$WORKDIR = x ]; then
504 echo "Work directory not specified."
507 echo "Work directory: $WORKDIR"
509 # Initialize experiment ups products and mrb.
511 echo "Initializing ups and mrb."
513 if [ x$INIT != x ]; then
514 if [ ! -f $INIT ]; then
515 echo "Environment initialization script $INIT not found."
518 echo "Sourcing $INIT"
521 echo "Sourcing setup_experiment.sh"
522 source ${CONDOR_DIR_INPUT}/setup_experiment.sh
525 echo PRODUCTS=$PRODUCTS
527 # Ifdh may already be setup by jobsub wrapper.
528 # If not, set it up here.
530 echo "IFDHC_DIR=$IFDHC_DIR"
531 if [ x$IFDHC_DIR = x ]; then
532 echo "Setting up ifdhc, because jobsub did not set it up."
535 echo "IFDHC_DIR=$IFDHC_DIR"
537 # Set GROUP environment variable.
540 if [ x$GRP != x ]; then
543 echo "GROUP not specified."
549 echo "IFDH_OPT=$IFDH_OPT"
551 # Make sure output directory exists and is writable.
553 if [ x$OUTDIR = x ]; then
554 echo "Output directory not specified."
557 echo "Output directory: $OUTDIR"
559 # Make sure log directory exists and is writable.
561 if [ x$LOGDIR = x ]; then
562 echo "Log directory not specified."
565 echo "Log directory: $LOGDIR"
567 # Make sure scratch directory is defined.
568 # For batch, the scratch directory is always $_CONDOR_SCRATCH_DIR
569 # For interactive, the scratch directory is specified by option
570 # --scratch or --outdir.
572 if [ $INTERACTIVE -eq 0 ]; then
573 SCRATCH=$_CONDOR_SCRATCH_DIR
575 if [ x$SCRATCH = x ]; then
579 if [ x$SCRATCH = x -o ! -d "$SCRATCH" -o ! -w "$SCRATCH" ]; then
580 echo "Local scratch directory not defined or not writable."
584 # Create the scratch directory in the condor scratch diretory.
585 # Copied from condor_lBdetMC.sh.
586 # Scratch directory path is stored in $TMP.
587 # Scratch directory is automatically deleted when shell exits.
589 # Do not change this section.
590 # It creates a temporary working directory that automatically cleans up all
591 # leftover files at the end.
592 TMP=`mktemp -d ${SCRATCH}/working_dir.XXXXXXXXXX`
593 TMP=${TMP:-${SCRATCH}/working_dir.$$}
595 { [[ -n "$TMP" ]] && mkdir -p "$TMP"; } || \
596 { echo "ERROR: unable to create temporary directory!" 1>&2; exit 1; }
597 trap "[[ -n \"$TMP\" ]] && { rm -rf \"$TMP\"; }" 0
599 # End of the section you should not change.
601 echo "Scratch directory: $TMP"
603 # Copy files from work directory to scratch directory.
605 echo "No longer fetching files from work directory."
606 echo "that's now done with using jobsub -f commands"
608 cp ${CONDOR_DIR_INPUT}/* ./work/
610 echo "Local working directoroy:"
615 # Save the hostname and condor job id.
617 hostname > hostname.txt
618 echo ${CLUSTER}.${PROCESS} > jobid.txt
620 # Set default CLUSTER and PROCESS environment variables for interactive jobs.
622 if [ $INTERACTIVE -ne 0 ]; then
623 CLUSTER=`date +%s` # From time stamp.
624 PROCESS=0 # Default zero for interactive.
627 # Override CLUSTER and PROCESS from command line options.
629 if [ x$CLUS != x ]; then
632 if [ x$PROC != x ]; then
635 if [ x$PROCMAP != x ]; then
636 if [ -f $PROCMAP ]; then
637 PROCESS=`sed -n $(( $PROCESS + 1 ))p $PROCMAP`
639 echo "Process map file $PROCMAP not found."
643 if [ x$CLUSTER = x ]; then
644 echo "CLUSTER not specified."
647 if [ x$PROCESS = x ]; then
648 echo "PROCESS not specified."
651 echo "Procmap: $PROCMAP"
652 echo "Cluster: $CLUSTER"
653 echo "Process: $PROCESS"
655 # Construct name of output subdirectory.
657 OUTPUT_SUBDIR=${CLUSTER}_${PROCESS}
658 echo "Output subdirectory: $OUTPUT_SUBDIR"
660 # Make sure init script exists and is executable (if specified).
662 if [ x$INITSCRIPT != x ]; then
663 if [ -f "$INITSCRIPT" ]; then
666 echo "Initialization script $INITSCRIPT does not exist."
671 # Make sure init source script exists (if specified).
673 if [ x$INITSOURCE != x -a ! -f "$INITSOURCE" ]; then
674 echo "Initialization source script $INITSOURCE does not exist."
678 # Make sure end-of-job script exists and is executable (if specified).
680 if [ x$ENDSCRIPT != x ]; then
681 if [ -f "$ENDSCRIPT" ]; then
684 echo "Initialization script $ENDSCRIPT does not exist."
689 # MRB run time environment setup goes here.
691 # Setup local test release, if any.
693 if [ x$LOCALDIR != x ]; then
697 # Copy test release directory recursively.
699 echo "Copying local test release from directory ${LOCALDIR}."
701 # Make sure ifdhc is setup.
703 if [ x$IFDHC_DIR = x ]; then
704 echo "Setting up ifdhc before fetching local directory."
707 echo "IFDHC_DIR=$IFDHC_DIR"
708 ifdh cp -r $IFDH_OPT $LOCALDIR .
710 if [ $stat -ne 0 ]; then
711 echo "ifdh cp failed with status ${stat}."
714 find . -name \*.py -exec chmod +x {} \;
715 find . -name \*.sh -exec chmod +x {} \;
717 # Setup the environment.
720 echo "Initializing localProducts from ${LOCALDIR}."
721 if [ ! -f $TMP/local/setup ]; then
722 echo "Local test release directory $LOCALDIR does not contain a setup script."
725 sed "s@setenv MRB_INSTALL.*@setenv MRB_INSTALL ${TMP}/local@" $TMP/local/setup | \
726 sed "s@setenv MRB_TOP.*@setenv MRB_TOP ${TMP}@" > $TMP/local/setup.local
727 . $TMP/local/setup.local
728 #echo "MRB_INSTALL=${MRB_INSTALL}."
729 #echo "MRB_QUALS=${MRB_QUALS}."
730 echo "Setting up all localProducts."
731 if [ x$IFDHC_DIR != x ]; then
738 # Setup local larsoft test release from tarball.
740 if [ x$LOCALTAR != x ]; then
746 echo "Fetching test release tarball ${LOCALTAR}."
748 # Make sure ifdhc is setup.
750 if [ x$IFDHC_DIR = x ]; then
751 echo "Setting up ifdhc before fetching tarball."
754 echo "IFDHC_DIR=$IFDHC_DIR"
755 ifdh cp $LOCALTAR local.tar
757 if [ $stat -ne 0 ]; then
758 echo "ifdh cp failed with status ${stat}."
762 # Extract the tarball.
766 # Setup the environment.
769 echo "Initializing localProducts from tarball ${LOCALTAR}."
770 sed "s@setenv MRB_INSTALL.*@setenv MRB_INSTALL ${TMP}/local@" $TMP/local/setup | \
771 sed "s@setenv MRB_TOP.*@setenv MRB_TOP ${TMP}@" > $TMP/local/setup.local
772 . $TMP/local/setup.local
773 #echo "MRB_INSTALL=${MRB_INSTALL}."
774 #echo "MRB_QUALS=${MRB_QUALS}."
775 echo "Setting up all localProducts."
776 if [ x$IFDHC_DIR != x ]; then
782 # Setup specified version of top level run time products
783 # (if specified, and if local test release did not set them up).
785 if [ x$IFDHC_DIR != x ]; then
789 for prd in `echo $UPS_PRDS | tr , ' '`
791 if ! ups active | grep -q $prd; then
792 echo "Setting up $prd $REL -q ${QUAL}."
793 setup $prd $REL -q $QUAL
801 # In case mrb setup didn't setup a version of ifdhc, set up ifdhc again.
803 if [ x$IFDHC_DIR = x ]; then
804 echo "Setting up ifdhc again, because larsoft did not set it up."
807 echo "IFDH_ART_DIR=$IFDH_ART_DIR"
808 echo "IFDHC_DIR=$IFDHC_DIR"
810 # Start project (if necessary), and consumer process.
815 # Make sure a project name has been specified.
817 if [ x$SAM_PROJECT = x ]; then
818 echo "No sam project was specified."
821 echo "Sam project: $SAM_PROJECT"
823 # Start project (if requested).
825 if [ $SAM_START -ne 0 ]; then
827 # If recursive flag, take snapshot of input dataset.
829 if [ $RECUR -ne 0 ]; then
830 echo "Forcing snapshot"
831 SAM_DEFNAME=${SAM_DEFNAME}:force
836 if [ x$SAM_DEFNAME != x ]; then
838 echo "Starting project $SAM_PROJECT using sam dataset definition $SAM_DEFNAME"
839 ifdh startProject $SAM_PROJECT $SAM_STATION $SAM_DEFNAME $SAM_USER $SAM_GROUP
840 if [ $? -eq 0 ]; then
841 echo "Start project succeeded."
843 echo "Start projet failed."
847 echo "Start project requested, but no definition was specified."
852 # Get the project url of a running project (maybe the one we just started,
853 # or maybe started externally). This command has to succeed, or we can't
856 PURL=`ifdh findProject $SAM_PROJECT $SAM_STATION`
857 if [ x$PURL = x ]; then
858 echo "Unable to find url for project ${SAM_PROJECT}."
861 echo "Project url: $PURL"
864 # Start the consumer process. This command also has to succeed.
870 echo "Starting consumer process."
871 echo "ifdh establishProcess $PURL $APPNAME $REL $NODE $SAM_USER $APPFAMILY hadd $NFILE $SAM_SCHEMA"
872 CPID=`ifdh establishProcess $PURL $APPNAME $REL $NODE $SAM_USER $APPFAMILY hadd $NFILE $SAM_SCHEMA`
873 if [ x$CPID = x ]; then
874 echo "Unable to start consumer process for project url ${PURL}."
877 echo "Consumer process id $CPID"
880 # Stash away the project name and consumer process id in case we need them
881 # later for bookkeeping.
883 echo $SAM_PROJECT > sam_project.txt
884 echo $CPID > cpid.txt
886 # Run/source optional initialization scripts.
888 if [ x$INITSCRIPT != x ]; then
889 echo "Running initialization script ${INITSCRIPT}."
890 if ! ./${INITSCRIPT}; then
894 if [ x$INITSOURCE != x ]; then
895 echo "Sourcing initialization source script ${INITSOURCE}."
898 if [ $status -ne 0 ]; then
903 # Save a copy of the environment, which can be helpful for debugging.
907 # Fetch files and construct local input list.
908 # Keep going until we have fetched $NFILE files or no more files are available.
910 rm -f condor_hadd_input.list
911 rm -f transferred_uris.list
912 touch condor_hadd_input.list
913 touch transferred_uris.list
915 while [ $NFILE -gt 0 ]
917 NFILE=$(( $NFILE - 1 ))
919 # Get uri of the next file
921 fileuri=`ifdh getNextFile $PURL $CPID`
923 if [ $stat != 0 ]; then
924 echo "ifdh getNextFile returned status $stat"
927 if [ x$fileuri = x ]; then
928 echo "ifdh getNextFile did not return anything."
932 # Break out of the loop if the same uri is returned twice.
934 if grep -q $fileuri transferred_uris.list; then
935 echo "File $filename was returned twice by sam."
939 # Find the local path to which this uri will be fetched.
942 if [[ ! $fileuri =~ ^root: ]]; then
943 filepath=`ifdh localPath $fileuri`
945 if [ $stat != 0 ]; then
946 echo "ifdh localPath returned status $stat"
949 if [ x$filepath = x ]; then
950 echo "ifdh localPath did not return anything."
956 ifdh fetchInput $fileuri
958 if [ $stat != 0 ]; then
959 echo "ifdh fetchInput returned status $stat"
962 if [ ! -f $filepath ]; then
963 echo "Transferred file $fileuri not found."
968 # If we get to here, file has been transferred successfully.
969 # Update the file status to consumed.
971 filename=`basename $filepath`
972 ifdh updateFileStatus $PURL $CPID $filename consumed
976 echo $fileuri >> transferred_uris.list
977 echo $filepath >> condor_hadd_input.list
983 hadd $TFILE @condor_hadd_input.list
985 echo $stat > hadd.stat
986 echo $stat > lar.stat
987 echo "hadd completed with exit status ${stat}."
989 # Setup up current version of ifdhc (may be different than version setup by larsoft).
991 #echo "Setting up current version of ifdhc."
992 #if [ x$IFDHC_DIR != x ]; then
996 echo "IFDHC_DIR=$IFDHC_DIR"
999 # Get list of consumed files.
1001 ifdh translateConstraints "consumer_process_id $CPID and consumed_status consumed" > consumed_files.list
1003 # End consumer process.
1005 ifdh endProcess $PURL $CPID
1007 # Stop project (if appropriate).
1009 if [ $SAM_START -ne 0 ]; then
1010 echo "Stopping project."
1011 ifdh endProject $PURL
1014 # Delete input files.
1016 if [ -f condor_hadd_input.list -a x$SAM_SCHEMA != xroot ]; then
1019 done < condor_hadd_input.list
1022 # Run optional end-of-job script.
1024 if [ x$ENDSCRIPT != x ]; then
1025 echo "Running end-of-job script ${ENDSCRIPT}."
1026 if ! ./${ENDSCRIPT}; then
1031 # Do root file checks.
1033 # Randomize the names of the output root files.
1034 for root in *.root; do
1035 base=`basename $root .root`_`uuidgen`
1036 mv $root ${base}.root
1037 if [ -f ${root}.json ]; then
1038 mv ${root}.json ${base}.root.json
1042 # Calculate root metadata for all root files and save as json file.
1043 # If json metadata already exists, merge with newly geneated root metadata.
1045 for root in *.root; do
1046 if [ -f $root ]; then
1048 if [ -f $json ]; then
1049 ./root_metadata.py --output="${json}2" $root >& /dev/null
1050 ./merge_json.py $json ${json}2 > ${json}3
1051 mv -f ${json}3 $json
1054 ./root_metadata.py --output="$json" $root >& /dev/null
1059 # Make local output directories for files that don't have a subrun.
1064 # Stash all of the files we want to save in a local directories that we just created.
1066 # First move .root and corresponding .json files into the out and log subdirectories.
1068 for root in *.root; do
1069 if [ -f $root ]; then
1071 if [ -f ${root}.json ]; then
1077 # Copy any remaining files into all log subdirectories.
1078 # These small files get replicated.
1080 for outfile in *; do
1081 if [ -f $outfile ]; then
1086 # Make a tarball of the log directory contents, and save the tarball in the log directory.
1089 tar -cjf log.tar -C log .
1092 # Create remote output and log directories.
1094 export IFDH_CP_MAXRETRIES=5
1096 echo "Make directory ${LOGDIR}/${OUTPUT_SUBDIR}."
1098 ifdh mkdir $IFDH_OPT ${LOGDIR}/$OUTPUT_SUBDIR
1099 echo "Done making directory ${LOGDIR}/${OUTPUT_SUBDIR}."
1102 if [ ${OUTDIR} != ${LOGDIR} ]; then
1103 echo "Make directory ${OUTDIR}/${OUTPUT_SUBDIR}."
1105 ifdh mkdir $IFDH_OPT ${OUTDIR}/$OUTPUT_SUBDIR
1106 echo "Done making directory ${OUTDIR}/${OUTPUT_SUBDIR}."
1110 # Transfer tarball in log subdirectory.
1115 echo "ifdh cp -D $IFDH_OPT log/log.tar ${LOGDIR}/$OUTPUT_SUBDIR"
1116 ifdh cp -D $IFDH_OPT log/log.tar ${LOGDIR}/$OUTPUT_SUBDIR
1119 if [ $stat -ne 0 ]; then
1121 echo "ifdh cp failed with status ${stat}."
1124 # Transfer root files in out subdirectory.
1126 if [ "$( ls -A out )" ]; then
1127 echo "ifdh cp -D $IFDH_OPT out/* ${OUTDIR}/$OUTPUT_SUBDIR"
1128 ifdh cp -D $IFDH_OPT out/* ${OUTDIR}/$OUTPUT_SUBDIR
1130 if [ $stat -ne 0 ]; then
1132 echo "ifdh cp failed with status ${stat}."
1136 if [ $statout -eq 0 -a -f log/hadd.stat ]; then
1137 statout=`cat log/hadd.stat`