Project

General

Profile

scontrol.py

Martin Kuemmel, 05/11/2022 09:52 AM

Download (9.97 KB)

 
1
#!/usr/bin/env python
2
'''
3
Created on March 23rd 2016
4

5
@author: mkuemmel@usm.lmu.de
6

7
Date:     $Date: 2018-05-24 10:25:02 +0200 (Thu, 24 May 2018) $
8
Revision: $Revision: 340 $
9
Author:   $Author: Martin.Kuemmel $
10
'''
11
import os.path
12
import subprocess
13

    
14
def theNodeStatus():
    """
    Collect the status of all nodes in the 'usm-cl' partition via 'sinfo'.

    The command 'sinfo -p usm-cl -o "%n  %t %e %m %O %C"' is run through
    execCommand(); its output file is parsed line by line, the first
    (header) line being skipped.  Per the sinfo format codes the columns
    are: node name, state, free memory, memory size, CPU load and the
    CPU counts as "allocated/idle/other/total".

    Returns a dict keyed by the node name.  Every value is a dict with:
        'state' -- the state string as printed by sinfo
        'load'  -- the fifth column as float, or '---'
        'mem'   -- 100 * (third column / fourth column) formatted with
                   '%4.0f' and a trailing '%', or '---'
        'cpu'   -- first and last item of the '/'-separated sixth column
                   joined by '/', or '---'
    The '---' placeholders are used for nodes in state 'down' and for
    nodes whose numeric columns can not be evaluated.

    Raises Exception if 'sinfo' finishes with a non-zero exit code; the
    output files are left in place in that case.
    """
    placeholder = '---'

    # run the 'sinfo' command
    # to list all nodes (no quotes necessary for format string)
    comSeq = ['sinfo', '-p', 'usm-cl', '-o', '%n  %t %e %m %O %C']
    allReturns = execCommand('sinfo', comSeq, prompt='execCommand>')
    if allReturns['retcode']:
        errMsg = 'Command: "%s" finished with exit code: %i!' % (comSeq, allReturns['retcode'])
        raise Exception(errMsg)

    # read in the node table written by 'sinfo'
    allNodes = {}
    try:
        with open(allReturns['stdout']) as sinfoOut:
            for lineIndex, aLine in enumerate(sinfoOut):
                # skip the first line, which is the column header
                if lineIndex < 1:
                    continue

                # isolate the items; ignore blank or truncated lines
                lineItems = aLine.split()
                if len(lineItems) < 2:
                    continue
                key, state = lineItems[0], lineItems[1]

                # start from placeholders and replace them only if all
                # numeric columns can be evaluated
                fmStr = load = cpuUsed = placeholder
                if state != 'down':
                    try:
                        fracMem = 100.0 * float(lineItems[2]) / float(lineItems[3])
                        nodeLoad = float(lineItems[4])
                        cpuParts = lineItems[5].split('/')
                    except (IndexError, ValueError, ZeroDivisionError):
                        # e.g. non-numeric entries for nodes that do not
                        # respond; keep the placeholders
                        pass
                    else:
                        fmStr = '%4.0f%s' % (fracMem, '%')
                        load = nodeLoad
                        cpuUsed = '%s/%s' % (cpuParts[0], cpuParts[-1])

                # store the info under the node name
                allNodes[key] = {'state': state, 'load': load, 'mem': fmStr, 'cpu': cpuUsed}
    finally:
        # destroy the output files, also when the parsing failed
        os.unlink(allReturns['stdout'])
        os.unlink(allReturns['stderr'])

    # return the node information
    return allNodes
61

    
62
def execCommand(command, commSequence, prompt='execCommand>', verbose=True):
    """
    Run an external command with stdout and stderr redirected to files.

    The files '<command>.out' and '<command>.err' are (re-)created in the
    current working directory.  They are not removed here; the caller has
    to delete them after use.

    command      -- label from which the two file names are built
    commSequence -- list with the program and its arguments, handed to
                    subprocess.call() without a shell
    prompt       -- not used; kept so that existing calls stay valid
    verbose      -- if true, the blank-joined command sequence is written
                    as first line into the stderr file

    Returns a dict with the items 'retcode' (exit code of the command),
    'stdout' and 'stderr' (names of the two files).

    Exceptions from subprocess.call() (e.g. program not found) are passed
    on; both files are closed in that case as well.
    """
    # make stdout and stderr files ready, removing leftovers of old runs
    stdoutFile = '%s.out' % command
    stderrFile = '%s.err' % command
    for staleFile in (stdoutFile, stderrFile):
        if os.path.isfile(staleFile):
            os.unlink(staleFile)

    # the with-statement closes both files even if the command fails to start
    with open(stdoutFile, "w+") as sout, open(stderrFile, "w+") as serr:

        # print the entire command
        if verbose:
            serr.write(' '.join(commSequence) + '\n')
            # flush before the child process writes into the same file
            serr.flush()

        # run the command
        retcode = subprocess.call(commSequence, shell=False, env=None, stdout=sout, stderr=serr)

    # give back the return
    return {'retcode': retcode, 'stdout': stdoutFile, 'stderr': stderrFile}
90

    
91
def getScontrolList():
    """
    Get the running and pending jobs of the 'usm-cl' partition.

    Runs 'scontrol show job' through execCommand() and parses the output
    file.  A line containing 'JobId' starts a new job record; all
    'key=value' items found up to the next such line are stored as
    strings in one dict per job.

    Returns a dict with the two items 'usm-clRun' and 'usm-clPend', each
    a list with the job dicts whose 'Partition' is 'usm-cl' and whose
    'JobState' is 'RUNNING' or 'PENDING', respectively.  Jobs of other
    partitions or in other states are dropped.

    Raises Exception if 'scontrol' finishes with a non-zero exit code;
    the output files are left in place in that case.
    """
    # run the 'scontrol' command
    # to list all jobs
    comSeq = ['scontrol', 'show', 'job']
    allReturns = execCommand('scontrol', comSeq, prompt='execCommand>')
    if allReturns['retcode']:
        errMsg = 'Command: "%s" finished with exit code: %i!' % (comSeq, allReturns['retcode'])
        raise Exception(errMsg)

    jobLists = {'usm-clRun': [], 'usm-clPend': []}
    oneJob = None
    try:
        # go over all lines of the output file
        with open(allReturns['stdout']) as scontrolOut:
            for aLine in scontrolOut:
                if 'JobId' in aLine:
                    # a new job starts: store the old one, open a new dict
                    _storeJob(oneJob, jobLists)
                    oneJob = {}
                elif oneJob is None:
                    # text in front of the first job record
                    continue
                _parseJobLine(aLine, oneJob)

        # store the last job
        _storeJob(oneJob, jobLists)
    finally:
        # destroy the output files, also when the parsing failed
        os.unlink(allReturns['stdout'])
        os.unlink(allReturns['stderr'])

    # return the job lists
    return jobLists


def _parseJobLine(aLine, oneJob):
    """Add all 'key=value' items of one output line to the job dict."""
    stripped = aLine.strip()

    # lines with a ',' are split at the commas, all others at blanks; the
    # comma split turns every item of a comma-separated value into its
    # own key (presumably how a 'mem' key arises from a resource list --
    # TODO confirm against the actual 'scontrol' output)
    if ',' in stripped:
        lineParts = stripped.split(',')
    else:
        lineParts = stripped.split()

    for aPart in lineParts:
        jobParts = aPart.split('=')
        if len(jobParts) < 2:
            continue
        # only the text up to a second '=' is kept as value
        oneJob[jobParts[0]] = jobParts[1]


def _storeJob(oneJob, jobLists):
    """Append a 'usm-cl' job to the list matching its state, if any."""
    if not oneJob or oneJob.get('Partition') != 'usm-cl':
        return
    jobState = oneJob.get('JobState')
    if jobState == 'RUNNING':
        jobLists['usm-clRun'].append(oneJob)
    elif jobState == 'PENDING':
        jobLists['usm-clPend'].append(oneJob)
162

    
163
def getUserInfo(jobLists):
    """
    Summarize the jobs of every job list per user.

    jobLists -- dict mapping a list name to a list of job dicts.  Every
                job dict holds strings and must provide 'JobId', 'UserId'
                and 'NumCPUs'; the memory is taken from the first key
                present among 'mem', 'MinMemoryNode' and 'MinMemoryCPU'.

    Returns a dict mapping every non-empty list name to a dict of users;
    empty job lists are left out.  Every user entry is a dict with:
        'hasNCpu' -- True if the numbers below are numeric sums
        'njobs'   -- number of jobs
        'ncpu'    -- sum of the CPUs (int)
        'memsize' -- sum of the memory in G (float)
        'jobids'  -- list of the job IDs
    Jobs whose CPU number or memory can not be converted to a number
    (e.g. a CPU range) are collected under the user ID with an 'S'
    appended; 'ncpu' and 'memsize' are then comma-separated strings of
    the raw values and 'hasNCpu' is False.

    Raises KeyError if a job lacks 'JobId', 'UserId' or 'NumCPUs'.
    """
    # go over all lists
    allLists = {}
    for listName in jobLists:
        jobs = jobLists[listName]

        # skip empty lists
        if len(jobs) < 1:
            continue

        # make a dict for the users
        allUsers = {}
        for oneJob in jobs:
            jobId = oneJob['JobId']

            # try getting numeric values for CPUs and memory;
            # define the user ID accordingly
            try:
                nCpu = int(oneJob['NumCPUs'])
                userMem = _jobMemoryGb(oneJob)
                userId = oneJob['UserId']
                hasNCpu = True
            except ValueError:
                nCpu = oneJob['NumCPUs']
                userMem = _rawJobMemory(oneJob)
                userId = '%sS' % oneJob['UserId']
                hasNCpu = False

            # create a new ID ...
            if userId not in allUsers:
                allUsers[userId] = {'hasNCpu': hasNCpu, 'njobs': 1, 'ncpu': nCpu, 'memsize': userMem, 'jobids': [jobId]}
                continue

            # ... or append the job to an existing ID
            userEntry = allUsers[userId]
            userEntry['njobs'] += 1
            userEntry['jobids'].append(jobId)
            if hasNCpu:
                userEntry['ncpu'] += nCpu
                userEntry['memsize'] += userMem
            else:
                # raw values are strings, build a comma-separated list
                userEntry['ncpu'] += ',%s' % nCpu
                userEntry['memsize'] += ',%s' % userMem

        # append the users to the combined list
        allLists[listName] = allUsers

    # return the combined list
    return allLists


def _jobMemoryGb(oneJob):
    """Return the memory of a job in G; 0.0 if the job names no memory."""
    # NOTE(review): 'MinMemoryCPU' is booked as given; presumably it is a
    # per-CPU value that would need a factor NumCPUs -- confirm
    for memKey in ('mem', 'MinMemoryNode', 'MinMemoryCPU'):
        if memKey in oneJob:
            return _memToGb(oneJob[memKey])
    return 0.0


def _memToGb(memText):
    """Convert a text like '500M', '8G' or '4000' (taken as M) to G."""
    unit = memText[-1:]
    if unit in ('K', 'M', 'G', 'T'):
        value = float(memText[:-1])
    else:
        # no unit given
        value = float(memText)
        unit = 'M'

    # a factor of 1000 between the units is used throughout
    if unit == 'M':
        return value / 1000.0
    if unit == 'K':
        return value / 1.0e6
    if unit == 'T':
        return value * 1000.0
    return value


def _rawJobMemory(oneJob):
    """Return the unconverted memory text of a job, '---' if there is none."""
    for memKey in ('MinMemoryNode', 'MinMemoryCPU', 'mem'):
        if memKey in oneJob:
            return oneJob[memKey]
    return '---'
232

    
233
def printUserJobList(userJobLists):
    """
    Print one line per user for every queue, followed by an empty line.

    userJobLists -- dict mapping a queue name to a dict of users; every
                    user entry provides 'hasNCpu', 'njobs', 'ncpu' and
                    'memsize'.  With 'hasNCpu' set, 'ncpu' and 'memsize'
                    are printed as numbers, otherwise as they are.
    """
    numberFormat = '%10s> %-15s njobs: %3i ncpus: %4i memory: %6.1fG'
    textFormat = '%10s> %-15s njobs: %3i ncpus: %s memory: %s'

    # go over all queues
    for queueName, queueUsers in userJobLists.items():

        # print all users for a queue
        for userName, summary in queueUsers.items():
            lineFormat = numberFormat if summary['hasNCpu'] else textFormat
            print(lineFormat % (queueName, userName, summary['njobs'], summary['ncpu'], summary['memsize']))

        # print a separator
        print('')
247

    
248
def doScontrol():
    """
    Print the per-user job summary.

    Chains the three steps: get the job lists via getScontrolList(),
    condense them per user with getUserInfo() and print the result with
    printUserJobList().
    """
    printUserJobList(getUserInfo(getScontrolList()))
260

    
261
def checkNodes(nodes=None):
    """
    Print a table with the status of the cluster nodes.

    nodes -- names of the nodes to list, in the order they are printed;
             if None, the eight 'usm-cl-bt0XnY' nodes are used

    The status is taken from theNodeStatus().  Nodes that are not part
    of its result are left out without notice.  The last table line
    gives the used and total number of CPUs, summed over all listed
    nodes whose 'cpu' entry has the form '<used>/<total>'.
    """
    # the list of nodes to check for
    if nodes is None:
        nodes = ['usm-cl-bt01n1', 'usm-cl-bt01n2', 'usm-cl-bt01n3', 'usm-cl-bt01n4',
                 'usm-cl-bt02n1', 'usm-cl-bt02n2', 'usm-cl-bt02n3', 'usm-cl-bt02n4']

    rowFormat = "%15s%8s%8s%8s%8s |"
    allNodes = theNodeStatus()

    # print the banner and the table header
    print("\n"
          "+-----------------------------------------------+\n"
          "+      USM cluster at the physics depart.       +\n"
          "+-----------------------------------------------+")
    print(rowFormat % ("Node:     ", "Status", "Slots", " Mem ", "Load"))
    print(rowFormat % ("----------", "------", "-----", " --- ", "----"))

    numTot = 0
    numUsed = 0
    for oneNode in nodes:
        if oneNode not in allNodes:
            continue
        nodeInfo = allNodes[oneNode]
        print(rowFormat % (oneNode.ljust(10), nodeInfo['state'], nodeInfo['cpu'], nodeInfo['mem'], nodeInfo['load']))

        # entries without a '/' are placeholders and carry no CPU numbers
        if '/' in nodeInfo['cpu']:
            procInfo = nodeInfo['cpu'].split('/')
            numUsed += int(procInfo[0])
            numTot += int(procInfo[1])

    print('CPUs used/total:   % 3i/%03i                |' % (numUsed, numTot))
    print("+-----------------------------------------------+\n")
287

    
288
def main():
    """
    Print the node table first, then the per-user job summary.
    """
    checkNodes()
    doScontrol()
292

    
293
# run the report only when the file is executed as a script
if __name__ == "__main__":
    main()
Redmine Appliance - Powered by TurnKey Linux