|
[Rivet-svn] r2866 - in trunk: . bin blackhole at projects.hepforge.org blackhole at projects.hepforge.org Thu Dec 30 12:51:07 GMT 2010
Author: buckley Date: Thu Dec 30 12:51:05 2010 New Revision: 2866 Log: Adding a run timeout option, inspired by crappy LCG Modified: trunk/ChangeLog trunk/bin/rivet Modified: trunk/ChangeLog ============================================================================== --- trunk/ChangeLog Wed Dec 29 23:47:06 2010 (r2865) +++ trunk/ChangeLog Thu Dec 30 12:51:05 2010 (r2866) @@ -1,3 +1,11 @@ +2010-12-30 Andy Buckley <andy at insectnation.org> + + * Adding a run timeout option, and small bug-fixes to the event + timeout handling, and making first event timeout work nicely with + the run timeout. Run timeout is intended to be used in conjunction + with timed batch token expiry, of the type that likes to make 0 + byte AIDA files on LCG when Grid proxies time out. + 2010-12-21 Andy Buckley <andy at insectnation.org> * Fix the cuts in the CDF 1994 colour coherence analysis. Modified: trunk/bin/rivet ============================================================================== --- trunk/bin/rivet Wed Dec 29 23:47:06 2010 (r2865) +++ trunk/bin/rivet Thu Dec 30 12:51:05 2010 (r2866) @@ -95,6 +95,11 @@ parser.add_option("--event-timeout", dest="EVENT_TIMEOUT", type="int", default=3600, metavar="NSECS", help="max time in whole seconds to wait for an event to be generated from the specified source (default = %default)") +parser.add_option("--run-timeout", dest="RUN_TIMEOUT", type="int", + default=None, metavar="NSECS", + help="max time in whole seconds to wait for the run to finish. This can be useful on batch systems such " + "as the LCG Grid where tokens expire on a fixed wall-clock and can render long Rivet runs unable to write " + "out the final histogram file (default = unlimited)") parser.add_option("--histo-interval", dest="HISTO_WRITE_INTERVAL", type=int, default=None, help="[experimental!] specify the number of events between histogram file updates. " "Default is to only write out at the end of the run. 
Note that intermediate histograms will be those " @@ -394,31 +399,35 @@ logging.info("Rivet running on machine %s (%s)" % (platform.node(), platform.machine())) +## Set up an event timeout handler +if opts.EVENT_TIMEOUT or opts.RUN_TIMEOUT: + def evttimeouthandler(signum, frame): + logging.warn("It has taken more than %d secs to get an event! Is the input event stream working?" % + min(opts.EVENT_TIMEOUT, opts.RUN_TIMEOUT)) + raise Exception("Event timeout") + signal.signal(signal.SIGALRM, evttimeouthandler) + + ## Init run based on one event hepmcfile = HEPMCFILES[0] -if opts.EVENT_TIMEOUT: - def evtinithandler(signum, frame): - logging.warn("It has taken more than %d secs to get the first event! Is the input event stream working?" % opts.EVENT_TIMEOUT) - raise Exception("Event initialisation timeout") - signal.signal(signal.SIGALRM, evtinithandler) - signal.alarm(opts.EVENT_TIMEOUT) try: + if opts.EVENT_TIMEOUT or opts.RUN_TIMEOUT: + signal.alarm(min(opts.EVENT_TIMEOUT, opts.RUN_TIMEOUT)) init_ok = run.init(hepmcfile) + signal.alarm(0) if not init_ok: - logging.error("Failed to initialise on event file %s... exiting" % hepmcfile) + logging.error("Failed to initialise using event file '%s'... exiting" % hepmcfile) sys.exit(2) except: - logging.error("Timeout in initialisation from event file %s... exiting" % hepmcfile) + logging.error("Timeout in initialisation from event file '%s'... exiting" % hepmcfile) sys.exit(3) -## Cancel timeout -signal.alarm(0) ## Event loop evtnum = 0 starttime = time.time() for fileidx, hepmcfile in enumerate(HEPMCFILES): - ## Open next HepMC file (does not apply to first file: it was already used for the run init) + ## Open next HepMC file (NB. 
this doesn't apply to the first file: it was already used for the run init) if fileidx > 0: run.openFile(hepmcfile) if not run.readEvent(): @@ -428,22 +437,39 @@ while opts.MAXEVTNUM is None or evtnum < opts.MAXEVTNUM: evtnum += 1 logNEvt(evtnum, starttime, opts.MAXEVTNUM) + ## Process this event processed_ok = run.processEvent() if not processed_ok: logging.warn("Event processing failed for evt #%i!" % evtnum) break + + ## Set flag to exit event loop if run timeout exceeded + if opts.RUN_TIMEOUT and (time.time() - starttime) > opts.RUN_TIMEOUT: + logging.warning("Run timeout of %d secs exceeded... exiting gracefully" % opts.RUN_TIMEOUT) + RECVD_KILL_SIGNAL = True + ## Exit the loop if signalled if RECVD_KILL_SIGNAL is not None: break - ## Read next event - read_ok = run.readEvent() - if not read_ok: - break + + ## Read next event (with timeout handling if requested) + try: + if opts.EVENT_TIMEOUT: + signal.alarm(opts.EVENT_TIMEOUT) + read_ok = run.readEvent() + signal.alarm(0) + if not read_ok: + break + except: + logging.error("Timeout in reading event from '%s'... exiting" % hepmcfile) + sys.exit(3) + ## Write a histo file snapshot if appropriate if opts.HISTO_WRITE_INTERVAL is not None: if evtnum % opts.HISTO_WRITE_INTERVAL == 0: ah.writeData(opts.HISTOFILE) + logging.info("Finished event loop") run.finalize()
More information about the Rivet-svn mailing list |