#
# @(#) $Id: Scheduler.py,v 1.58 2004/05/06 14:09:40 ivm Exp $
#
# $Author: ivm $
#
# $Log: Scheduler.py,v $
# Revision 1.58  2004/05/06 14:09:40  ivm
# Fixed MaxTimeRun conversion from string to float
#
# Revision 1.57  2003/08/27 17:01:43  ivm
# Reduce priority of inactive queues even if there is no priority
# window shift
#
# Revision 1.56  2003/08/20 18:58:58  ivm
# Implemented CPU power, round-robin-over-users scheduling inside queue,
# other minor things.
#
# Revision 1.55  2003/01/14 22:07:10  ivm
# Implemented round-robin over users
#
# Revision 1.54  2001/11/20 19:42:15  ivm
# Implemented CPU and real time limits for proc. type
# Fixed launcher reconfiguration bug
#
# Revision 1.53  2001/11/06 20:12:02  ivm
# Gradually reduce priorities of inactive queues
# Corrected calculations in RM.Node.canAllocateResources
#
# Revision 1.51  2001/11/05 19:22:03  ivm
# Fixed some bugs
# Made interactive spawner independent of UPS
#
# Revision 1.50  2001/10/24 18:34:37  ivm
# Implemented ProcType.max_nodes
# Fixed section priority incrementing
#
# Revision 1.49  2001/10/18 02:58:23  ivm
# Fixed bug in handling of new scratch disks in launcher.py
# Implemented runable/non-runable cache in Scheduler, Queue
#
# Revision 1.47  2001/09/25 18:41:38  ivm
# Changes for Python 2.1
#
# Revision 1.46  2001/08/23 19:05:54  ivm
# Implemented search with feedback RM algorithm
# Added -v option to status.py
# Fixed handling of time limits in FBSSectionInfo
#
# Revision 1.45  2001/05/11 14:04:04  ivm
# Use all queue priorities to cut queue list, not only the highest
#
# Revision 1.44  2001/03/15 21:47:55  ivm
# Implemented "on nodes"
# Fixed protocol version handling in lch, lchif
#
# Revision 1.43  2001/01/08 16:49:56  ivm
# Slower time unit change in Scheduler
# Implemented probing launcher -> bmgr
#
# Revision 1.42  2000/12/18 20:30:17  ivm
# Use flock in Launcher
# Put inactive queue priorities to min in Scheduler
#
# Revision 1.41  2000/12/18 17:48:08  ivm
# Improved format of history output
# Cleaned-up Scheduler code
#
# Revision 1.40  2000/12/12 21:02:17  ivm
# Code clean-up
#
# Revision 1.39  2000/11/30 20:23:18  ivm
# Fixed bugs
# Made Scheduler more conservative about unknown queues/ptypes
# Use /tmp/launcher.pid for launcher inter-locking
#
# Revision 1.38  2000/11/16 20:07:05  ivm
# v1_2_5
# Fixed None in getAvgTimeInUnits in Scd
#
# Revision 1.37  2000/11/15 15:19:31  ivm
# Use short_job as time unit in Scheduler
# Fix va_args in krb5module
# Added scripts/kill_logd.sh scripts/shutdown_logd.sh
#
# Revision 1.35  2000/11/14 19:09:47  ivm
# Fixed syntax errors
# Recover Username from UID in SectParam
#
# Revision 1.34  2000/11/13 21:44:20  ivm
# Implemented elapsed time tracking and invariance upon section size
#
# Revision 1.33  2000/11/08 20:27:02  ivm
# Implemented FBSClient.getServerOptions()
# Fixed bugs
#
# Revision 1.31  2000/11/01 16:06:08  ivm
# Made compile for SunOS
# Introduced "placement" parameter to allocateLocal methods
#
# Revision 1.30  2000/10/06 17:24:15  ivm
# Removed debug print-outs
#
# Revision 1.29  2000/09/27 20:38:01  ivm
# Implemented modifications suggested during the code review
#
# Revision 1.27  2000/09/22 18:32:18  ivm
# Fixed bugs found during code review
#
# Revision 1.26  2000/08/24 14:23:18  ivm
# Fixed bugs
# Shift queue priorities back to minimum
#
# Revision 1.25  2000/08/08 16:19:47  ivm
# Fixed user message sending from batch process to API
#
# Revision 1.24  2000/08/07 14:40:33  ivm
# Added extra parameter to RM.can*AllocateGlobal()
#
# Revision 1.23  2000/08/03 19:35:37  ivm
# Fixed some bugs
#
# Revision 1.22  2000/08/02 15:01:08  ivm
# Use new RM
#
# Revision 1.20  2000/07/03 22:40:19  ivm
# Fix typo in Scheduler
# Fix kill_all.sh
#
# Revision 1.19  2000/06/29 22:14:49  ivm
# *** empty log message ***
#
# Revision 1.18  2000/06/28 18:51:54  ivm
# Run scheduler from 1 to 6 times per minute
# Do not enable scheduler when a process exits (killJob kludge)
#
# Revision 1.17  2000/06/21 19:26:33  ivm
# Scheduler runs not more often than 2/minute
#
# Revision 1.16  2000/06/19 16:53:55  ivm
# Increment priority of all queues, not only higher priority
#
# Revision 1.13  2000/05/30 16:14:43  ivm
# Tested queue priorities
#
# Revision 1.12  2000/04/20 21:18:44  ivm
# Fixed inter-queue scheduling
#
# Revision 1.11  2000/04/17 15:48:56  ivm
# Made modifications for LogClient
#
# Revision 1.10  2000/04/06 15:14:36  ivm
# *** empty log message ***
#
# Revision 1.9  2000/03/30 21:05:03  ivm
# Removed debug print-outs
#
# Revision 1.8  2000/03/30 20:53:59  ivm
# Made default idle sleep time 1 minute
#
# Revision 1.7  2000/03/30 20:44:48  ivm
# Removed and added some printouts
#
# Revision 1.5  2000/03/09 20:19:24  ivm
# *** empty log message ***
#
# Revision 1.3  2000/02/15 16:51:24  ivm
# *** empty log message ***
#
# Revision 1.2  2000/01/31 21:39:44  ivm
# Fixed more minor bugs
#
#

import	bmgr_global
import	time

from Tracer import Tracer

class	Scheduler:
	# See FBSNG Scheduler for algorithms description.
	# See also Queue.py
	def __init__(self, cfg):
		"""Initialize scheduler state from the configuration object.

		cfg must provide getValue(set, name, param, default) and
		getValueDict(set, name, param).  Only the ('bmgr', '*') parameters
		'short_job', 'rsrc_units' and 'max_sched_run' are read here.
		"""
		self.Enabled = 0	# run() is a no-op until enable() is called
		self.LastRun = 0	# time.time() at the end of the last run()
		self.CanRun = 1		# set by trigger(), consumed by run()
		#self.DefProcTime = cfg.getValue('bmgr','*','default_time',60*60)
		self.ShortJob = cfg.getValue('bmgr','*','short_job',10*60)
		# All running time statistics start out at the short job length
		self.MinTime = self.ShortJob
		self.MaxTime = self.ShortJob
		self.TimeUnit = self.ShortJob
		# Resource name -> size of one "unit", used by sectionWeight()
		self.RsrcUnits = cfg.getValueDict('bmgr','*','rsrc_units')
		if self.RsrcUnits is None:
			self.RsrcUnits = {}
		# Optional upper limit on the duration of a single run()
		self.MaxRunTime = cfg.getValue('bmgr','*','max_sched_run',None)
		if self.MaxRunTime is not None:
			self.MaxRunTime = float(self.MaxRunTime)
		#self.Tracer = Tracer()
		# Create the cache here so that sectionIsRunable() and
		# unrunableSection() are usable even before the first run()
		self.RunableCache = {}
		
	def getAvgTimeInUnits(self, q, ptname):
		"""Return the average run time of process type ptname in queue q,
		expressed in scheduler time units (see timeUnit()).

		When the queue has no average for the process type
		(q.getAvgTime() returns None), ten times maxTime() is assumed.
		The result is always a float.
		"""
		t = q.getAvgTime(ptname)
		if t is None:
			# nothing known yet about this process type: assume a long job
			t = self.maxTime() * 10
		# float() prevents truncating integer division while both values
		# are still integers (e.g. the configured short_job default)
		return float(t) / self.timeUnit()

	def maxTime(self):
		"""Return the tracked maximum job time, or the current time unit
		if no maximum is set."""
		longest = self.MaxTime
		if longest is not None:
			return longest
		return self.timeUnit()
	
	def updateMaxTime(self, t):
		"""Fold run time t into the tracked maximum job time.

		Times shorter than ShortJob are ignored.  A time above the current
		maximum pulls it up quickly (weight 0.9), anything else pulls it
		down slowly (weight 0.1).
		"""
		if t < self.ShortJob:
			return
		current = self.MaxTime
		if t > current:
			newWeight, oldWeight = 0.9, 0.1
		else:
			newWeight, oldWeight = 0.1, 0.9
		self.MaxTime = newWeight * t + oldWeight * current
		
	def minTime(self):
		"""Return the tracked minimum job time, or the current time unit
		if no minimum is set."""
		shortest = self.MinTime
		if shortest is not None:
			return shortest
		return self.timeUnit()
		
	def updateMinTime(self, t):
		"""Fold run time t into the tracked minimum job time and then
		refresh the time unit.

		Times shorter than ShortJob are ignored.  A time below the current
		minimum pulls it down quickly (weight 0.9), anything else pulls it
		up slowly (weight 0.1).
		"""
		if t < self.ShortJob:
			return
		current = self.MinTime
		if t < current:
			newWeight, oldWeight = 0.9, 0.1
		else:
			newWeight, oldWeight = 0.1, 0.9
		self.MinTime = newWeight * t + oldWeight * current
		# the time unit follows the minimum time
		self.updateTimeUnit()

	def timeUnit(self):
		"""Return the current scheduler time unit."""
		unit = self.TimeUnit
		return unit

	def updateTimeUnit(self):
		"""Move the time unit one tenth of the way towards minTime()."""
		previous = self.TimeUnit
		target = self.minTime()
		self.TimeUnit = previous * 0.9 + target * 0.1

	def procExit(self, sect, nodename):
		"""Account for a process of section sect that exited on node nodename.

		Always triggers the scheduler.  If the section's queue can be
		found, the run time (multiplied by the power of the node's class)
		is folded into the time statistics when it exceeds ShortJob, and
		the queue priority is adjusted according to how much of the
		section's time estimate was left unused.
		"""
		# something has changed: allow run() to do its work again
		self.trigger()
		queueName = sect.Queue
		try:
			queue = bmgr_global.G_QueueFinder[queueName]
		except:
			# queue can not be found - nothing to account against
			return
		elapsed = time.time() - sect.StartTime
		nodeClass = bmgr_global.G_ResourceManager.getClassOfNode(nodename)
		skip1, skip2, skip3, power = \
				bmgr_global.G_ResourceManager.getNodeClass(nodeClass)
		normalized = elapsed * power

		# only jobs longer than a short job contribute to the statistics
		if normalized > self.ShortJob:
			self.updateMinTime(normalized)
			self.updateMaxTime(normalized)
			queue.updateAvgTime(sect.ProcType, normalized)

		estimate = sect.TimeEstimate
		if estimate is None:
			return
		# TimeEstimate is normalized !
		charged = sect.PrioDecr
		leftOver = estimate - normalized
		credit = float(charged) * leftOver / estimate / sect.NProc - 1
		# round to the nearest integer, halves away from zero
		if credit >= 0:
			credit = int(credit + 0.5)
		else:
			credit = int(credit - 0.5)
		queue.incPrio(credit)

	def log(self, level, msg):
		"""Send msg, prefixed with 'SCD: ', to the global log client."""
		text = 'SCD: %s' % msg
		bmgr_global.G_LogClient.log(level, 0, text)
		
	def enable(self):
		"""Allow run() to do its work; it returns immediately until
		this has been called."""
		self.Enabled = 1

	def trigger(self):
		"""Mark the scheduler as having work to do, so that the next
		run() is not skipped for lack of a trigger."""
		self.CanRun = 1
		
	def _cmpQueues(self, qt1, qt2):
		"""cmp-style comparison of two (queue, section list) tuples.

		Queues with higher Prio sort first.  For equal priorities the
		queue whose first section has the earlier SubTime sorts first;
		submission times are compared with 10ms precision.

		Returns -1, 0 or 1.  Only the sign matters to the sort; the result
		is normalized so that a float priority difference or a very large
		time difference never yields a value the sort would not accept.
		"""
		q1, sl1 = qt1
		q2, sl2 = qt2
		dp = q1.Prio - q2.Prio
		if dp > 0:	return -1
		if dp < 0:	return 1
		# compare time with 10ms precision
		dt = int((sl1[0].SubTime - sl2[0].SubTime)*100)
		if dt > 0:	return 1
		if dt < 0:	return -1
		return 0

	def sectionWeight(self, sect):
		"""Return the weight of section sect in resource units.

		For every resource listed in RsrcUnits the requested amount is
		divided by the unit size.  Per-process requirements (local and
		global) are counted once per process, per-section requirements
		once.  A section that requests none of the listed resources
		weighs as much as its number of processes.
		"""
		w = 0.0
		for rn, u in self.RsrcUnits.items():
			for reqs, mult in (
					(sect.PerProcLocal, sect.NProc),
					(sect.PerProcGlobal, sect.NProc),
					(sect.PerSectGlobal, 1)):
				# missing and zero requirements contribute nothing
				r = reqs.get(rn)
				if r:
					w = w + float(r)/u * mult
		if w == 0.0:
			w = sect.NProc
		return w
		
	def resetRunableCache(self):
		"""Forget which section signatures were found unrunable."""
		self.RunableCache = dict()
		
	def run(self):
		"""Start ready sections, one per loop iteration, and charge the
		queues for them.

		Does nothing unless the scheduler is enabled and has been
		triggered (a run is forced once 60 seconds have passed since the
		last one), and never runs within 5 seconds of the previous run.
		The loop ends when there are no active queues, when no section
		could be started, or when MaxRunTime (if set) has elapsed.
		"""
		#print 'SCD: run()...'
		# Run at least once per minute
		if time.time() > self.LastRun + 60:
			self.CanRun = 1
		if not self.Enabled or not self.CanRun: return
		# do not run more often than 6 times a minute
		# NOTE(review): a 5 second interval allows up to 12 runs a minute,
		# while this comment and the rev. 1.18 log entry say 6 - confirm
		# which one is intended
		if time.time() < self.LastRun + 5:
			return
		# the runable/non-runable cache is valid for one run() only
		self.resetRunableCache()
		#self.Tracer.clear()
		#self.Tracer.start()
		self.CanRun = 0
		runAgain = 1
		runStart = time.time()
		while runAgain and \
				(self.MaxRunTime == None or time.time() < runStart + self.MaxRunTime):
			runAgain = 0
			# activeList: (queue, section list) tuples, inactiveList: queues
			activeList, inactiveList = self.makeQList()
			if not activeList:	break
					
			qlst = self.sortQList(activeList)

			# debug
			#msg = 'Run: prios: '
			#for q, sl in qlst:
			#	msg = msg + '%s:%d ' % (q.Name, q.Prio)
			#self.log('D', msg)

			# drop queues which are too far below the others in priority
			qlst = self.cutQList(qlst)
			if not qlst:	break
			# try to start one section; (None, None) if nothing started
			theQ, theSect = self.singleRun(qlst)
			if theQ == None: 	break
			# The section is charged for twice the average run time of
			# its process type, measured in time units
			time_estimate = self.getAvgTimeInUnits(theQ, theSect.ProcType) * 2
			decr = theQ.QPDec * self.sectionWeight(theSect) * time_estimate
			# same estimate converted back from time units; procExit()
			# compares it with the normalized run time
			theSect.TimeEstimate = time_estimate * self.timeUnit()
			decr = int(decr + 0.5)

			#print 'Avg %s.%s = %s' % (
			#		theQ.Name, theSect.ProcType,
			#		theQ.getAvgTime(theSect.ProcType))
			#print 'Time min/max/unit = %s/%s/%s' % (self.minTime(),
			#	self.maxTime(), self.timeUnit())
			#print 'Decrementing queue %s by %s*%s*%s = %s' % (
			#		theQ.Name,
			#		theQ.QPDec, theSect.NProc,
			#		self.getAvgTimeInUnits(theQ, theSect.ProcType),
			#		decr)
			#self.log('D', 'Run: started from #%d: %s' % (inx, theQ.Name))

			# inactive queues are decremented along with the queue
			# which has just started a section
			for q in inactiveList:
				q.incPrio(-decr)

			# shift: how far below its MinQPrio the full decrement
			# would take the queue
			shift = max(0, theQ.MinQPrio - (theQ.Prio - decr))
			# presumably incPrio() returns the change actually applied
			# - TODO confirm in Queue.py
			thisDecrement = -theQ.incPrio(-decr)
			maxIncrement = 0
			if shift > 0:
				# raise the other active queues by the shift instead
				for q, sl in activeList:
					if not q is theQ:
						maxIncrement = max(maxIncrement, q.incPrio(shift))
			decr = maxIncrement + thisDecrement
			# remembered on the section; procExit() uses it to compute
			# the credit for unused time
			theSect.PrioDecr = max(0, decr - theSect.NProc)
				
			runAgain = 1
		#self.Tracer.trace('run')
		#lastt = 0
		#for t, p in self.Tracer.dump():
		#	self.log('T', 'Tracer: %s %s %s' % (t, t-lastt, p))
		#	lastt = t
		# runAgain is still set only if the loop was stopped by MaxRunTime
		# right after a section was started: stay triggered in that case
		self.CanRun = runAgain
		self.LastRun = time.time()

	def cutQList(self, qlst):
		"""Return the (queue, section list) tuples of qlst whose queue
		priority is high enough to be considered, in the original order.

		A queue is dropped if any queue in the list has Prio - QPGap
		greater than its priority.  This is the same as keeping the queues
		with Prio not less than the largest Prio - QPGap in the list, so
		the threshold is computed once instead of rescanning the list for
		every queue.
		"""
		if not qlst:
			return []
		threshold = max([q.Prio - q.QPGap for q, sl in qlst])
		return [(q, sl) for q, sl in qlst if q.Prio >= threshold]

	def sortQList(self, qlist):
		"""Return a copy of qlist ordered by _cmpQueues(); qlist itself is
		not modified."""
		qlst = qlist[:]
		try:
			from functools import cmp_to_key
		except ImportError:
			# old interpreter: the comparison function is passed directly
			qlst.sort(self._cmpQueues)
		else:
			# list.sort() no longer accepts a comparison function as a
			# positional argument in Python 3; this form works in both
			qlst.sort(key=cmp_to_key(self._cmpQueues))
		return qlst

	def makeQList(self):
		"""Split all known queues into active and inactive ones.

		Returns (active, inactive): active is a list of
		(queue, section list) tuples for the queues which are not held and
		have a non-empty schedList(); inactive is a list of all the other
		queues.  The active list is rotated by an offset derived from the
		current time.
		"""
		finder = bmgr_global.G_QueueFinder
		activeQueues = []
		idleQueues = []
		for name in finder.queues():
			queue = finder[name]
			if queue.isHeld():
				ready = []
			else:
				ready = queue.schedList()
			if len(ready) > 0:
				activeQueues.append((queue, ready))
			else:
				idleQueues.append(queue)
		# randomize the list
		if activeQueues:
			offset = int(time.time()) % len(activeQueues)
			activeQueues = activeQueues[offset:] + activeQueues[:offset]
		return activeQueues, idleQueues

	def remFromQList(self, qlst, que):
		"""Return qlst without the first tuple whose queue is que.

		A new list is returned when the queue is found; when it is not,
		qlst itself is returned unchanged.
		"""
		for pos, (q, sl) in enumerate(qlst):
			if q is que:
				return qlst[:pos] + qlst[pos+1:]
		return qlst
		
	def runCacheKey(self, s):
		"""Return the runable cache key of section s: the repr() of its
		process type, number of processes, resource requirements,
		placement and node list.  An empty node list counts as None."""
		nodes = s.OnNodes
		if not nodes:
			nodes = None
		signature = (s.ProcType, s.NProc, s.PerProcLocal, s.PerProcGlobal,
				s.PerSectGlobal, s.Placement, nodes)
		return repr(signature)

	def sectionIsRunable(self, s):
		"""Return the cached runable flag for sections with the same
		signature as s (see runCacheKey()), or 1 if nothing is cached
		for it."""
		# single lookup; dict.get() is available in every Python version,
		# unlike has_key()
		return self.RunableCache.get(self.runCacheKey(s), 1)

	def unrunableSection(self, s):
		"""Record in the runable cache that sections with the same
		signature as s (see runCacheKey()) can not be started."""
		self.RunableCache[self.runCacheKey(s)] = 0

	def singleRun(self, qlst):
		"""Try to start one section from the queues in qlst.

		qlst is a list of (queue, section list) tuples, scanned in order.
		For each section not known to be unrunable, local and then global
		resources are requested from the resource manager; a section whose
		resources can not be allocated is recorded as unrunable.  The
		first section which starts successfully ends the scan of its
		queue.

		Returns (queue, section) for the started section, or (None, None)
		if nothing was started.
		"""
		#print 'SCD: singleRun()...'
		#print 'SCD: %d queues with ready sections' % len(qlst)
		for q, slist in qlst:
			#print 'SCD: Queue %s(%d):' % (q.Name, len(slist))
			#for s in slist:
			#	print	'%s(%d) ' % (s.ID, s.Prio),
			#print ''
			started = None
			for s in slist:
				if not self.sectionIsRunable(s):
					continue
				gsreq = s.PerSectGlobal
				lpreq = s.PerProcLocal
				gpreq = s.PerProcGlobal
				np = s.NProc
				pt = s.ProcType
				placement = s.Placement
				onNodes = s.OnNodes
				if not onNodes: onNodes = None
				loclst = bmgr_global.G_ResourceManager.allocateLocal(
							np, lpreq, pt, placement, onNodes = onNodes)
				if loclst is None:
					self.unrunableSection(s)
					continue
				sectrpur, globlst = bmgr_global.G_ResourceManager.allocateGlobal(
							np, gpreq, pt, gsreq)
				if sectrpur is None:
					self.unrunableSection(s)
					continue
				if s.start(sectrpur, globlst, loclst) == 0:
					self.log('I','Started section %s' % s.ID)
					started = s.ID
					break
				else:
					self.log('E','Failed to start section %s' % s.ID)						
			# compare with None so that a section ID which happens to be
			# false (0, '') still counts as started
			if started is not None:
				# update section priorities: ready sections listed before
				# the started one get their priority incremented
				for s in q.schedList(getAll=1):
					if s.ID == started: 		
						q.rotateUsers(s.Username)
						return q, s
					if s.state() == 'ready':	q.incSectPrio(s)
				# NOTE(review): if the started section is not in the
				# list, the scan goes on to the next queue - confirm
				# that this is intended
		#print 'SCD: singleRun(): returning None'
		return None, None

