Source code for viewclust.slurm.sacct_jobs

from io import StringIO
import subprocess
import pandas as pd
import os

# Time columns in job records.
# If we exclude PENDING jobs (as we do before the consistency check), all time
# columns should have a timestamp, except for RUNNING jobs, which do not have
# an 'End' stamp.
time_columns = ['Eligible','Submit','Start','End']

# Define what constitutes a duplicate job
duplicate_job_def = ['JobID','Submit','Start']
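# Sketch (not used elsewhere in this module): 'duplicate_job_def' is the column
# subset intended for flagging duplicate job records with pandas, e.g.:
#   dupes = frame.duplicated(subset=duplicate_job_def, keep='last')
#   frame = frame[~dupes]
# The name 'frame' here is illustrative only.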


def sacct_jobs(account_query, d_from, d_to='', debugging=False,
               serialize_frame='', slurm_names=False):
    """Ingest job record information from slurm via sacct and return DataFrame.

    Parameters
    ----------
    account_query: str
        String query to be sent to sacct via -A flag.
    d_from: date str
        Beginning of the query period, e.g. '2019-04-01T00:00:00'.
    d_to: date str, optional
        End of the query period; used as the end time for jobs that are
        still running. Defaults to the empty string.
    debugging: boolean, optional
        Boolean for reporting progress to stdout. Default False.
    serialize_frame: str, optional
        Pickle the resulting DataFrame. If empty, pickling is skipped.
        Defaults to the empty string.
    slurm_names: boolean, optional
        Keep slurm's sacct column names instead of shorthands.
        Defaults to False.

    Returns
    -------
    DataFrame
        Returns a standard pandas DataFrame, or an empty DataFrame if no
        jobs are found.
    """
    raw_frame = _get_slurm_records(pd.to_datetime(d_from))
    if raw_frame.empty:
        # No jobs found: return the empty frame as documented above.
        return raw_frame
    out_frame = _slurm_raw_processing(raw_frame, slurm_names)

    # Legacy/consistency check:
    # Protect end time for jobs that are still currently running.
    end_col = 'End' if slurm_names else 'end'
    out_frame[end_col] = out_frame[end_col].replace({pd.NaT: pd.to_datetime(d_to)})
    # return _slurm_consistency_check(out_frame) if debugging else out_frame

    # TODO: consider swapping this to a better format
    if serialize_frame != '':
        out_frame.to_pickle(serialize_frame, protocol=4)

    return out_frame
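# Example usage (sketch): the account name and dates below are placeholders,
# and slurm_names is left at its default (False), so the returned columns are
# lowercased shorthands such as 'jobid', 'user', 'start', 'end', 'ncpus'.
#   jobs = sacct_jobs('def-someuser', '2019-04-01T00:00:00',
#                     d_to='2019-05-01T00:00:00')
#   if not jobs.empty:
#       print(jobs[['jobid', 'user', 'start', 'end', 'ncpus']].head())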
def _get_slurm_records(arg, ssh_client=None):
    '''Retrieve records either by running sacct or from a file dump.'''
    sacct_format = 'Account,AllocCPUS,AllocNodes,AllocTRES,AssocID,Cluster,CPUTimeRAW,'\
        'CPUTime,DerivedExitCode,ElapsedRaw,Elapsed,Eligible,End,ExitCode,Flags,GID,Group,'\
        'JobID,JobIDRaw,NCPUS,NNodes,NodeList,Priority,Partition,QOS,QOSRAW,Reason,ReqCPUS,'\
        'ReqMem,ReqNodes,ReqTRES,Reserved,ResvCPURAW,ResvCPU,Start,State,Submit,Suspended,'\
        'SystemCPU,TimelimitRaw,Timelimit,TotalCPU,UID,User,UserCPU,WorkDir'
    sacct_command = 'TZ=UTC sacct'
    sacct_options = ('--duplicates --allusers --allocations --parsable2 '
                     f'--delimiter=";" --format={sacct_format}')

    if isinstance(arg, str):
        # Read a SLURM dump from a file
        command = None
        if not os.path.isfile(arg):
            print('The seed file does not exist. Quitting.')
            return pd.DataFrame()
        with open(arg, 'r') as seed_file:
            source = seed_file.read()
    elif isinstance(arg, list) and arg:
        # Get specific jobs
        command = f'{sacct_command} {sacct_options} --jobs {",".join(arg)}'
    elif isinstance(arg, pd.Timestamp):
        # Get a list of jobs in a date range.
        # Note that --start selects jobs in ANY state after the specified time.
        # This is not the same as filtering by 'Start' afterwards.
        command = f'{sacct_command} {sacct_options} --start {arg:%Y-%m-%dT%H:%M} --end Now'
    else:
        print('Unexpected input parameter to _get_slurm_records().')
        return pd.DataFrame()

    if command:
        process = subprocess.Popen(command, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE, shell=True)
        stdout, stderr = process.communicate()
        source = stdout.decode('UTF-8')

    try:
        records = pd.read_csv(StringIO(source), sep=';', dtype='str',
                              on_bad_lines='skip')
    except Exception:
        # TODO: Fix this to be less heavy handed
        return pd.DataFrame()

    return pd.DataFrame() if records.empty else records


def _slurm_raw_processing(records, slurm_names):
    '''Convert raw sacct output into typed columns and derived metrics.'''
    # Drop fully identical records.
    check = records.duplicated(keep=False)
    if check.any():
        duplicated_records = records.loc[check, 'JobID'].unique().tolist()
        len1 = len(records)
        records.drop_duplicates(keep='last', inplace=True, ignore_index=True)
        len2 = len(records)
        print(f'Dropped {len1-len2} fully identical records '
              f'(JobIDs: {duplicated_records}).')

    # Convert date/time columns from 'str' to the 'datetime' type.
    # Invalid parsing will be set to NaT.
    records[time_columns] = records[time_columns].apply(
        pd.to_datetime, errors='coerce')

    # Convert integer columns from 'str' to 'Int64'.
    # Invalid parsing will be set to NaN and then to 0.
    columns_int = ['AllocCPUS', 'AllocNodes', 'AssocID', 'CPUTimeRAW',
                   'ElapsedRaw', 'GID', 'JobIDRaw', 'NCPUS', 'NNodes',
                   'Priority', 'QOSRAW', 'ReqCPUS', 'ReqNodes', 'ResvCPURAW',
                   'TimelimitRaw', 'UID']
    records[columns_int] = records[columns_int].apply(
        pd.to_numeric, errors='coerce').fillna(0).astype('Int64')

    # Replace the formatted columns with their raw (numeric) counterparts.
    records['Timelimit'] = records['TimelimitRaw']
    records['CPUTime'] = records['CPUTimeRAW']
    records['Elapsed'] = records['ElapsedRaw']
    records['ResvCPU'] = records['ResvCPURAW']
    records.drop(columns=['TimelimitRaw', 'CPUTimeRAW', 'ElapsedRaw',
                          'ResvCPURAW'], inplace=True)

    # Allocated memory per job. Note that memory can be specified as a float
    # in the submission script, so we preserve that type for multiplication
    # and then cast to integer.
    records[['Mem', '_mem_unit']] = records['AllocTRES'].str.extract(
        r'mem=([0-9.]+)(M|G|T)')
    records['Mem'] = pd.to_numeric(
        records['Mem'], errors='coerce').fillna(0).astype('float64')
    # Normalize memory to MB: multiply G and T values by 1024 and 1024*1024.
    records['Mem'] = records['Mem'].mask(records['_mem_unit'] == 'G',
                                         records['Mem']*1024)
    records['Mem'] = records['Mem'].mask(records['_mem_unit'] == 'T',
                                         records['Mem']*1024*1024)
    records['Mem'] = records['Mem'].round(0).astype('Int64')
    records['MemTime'] = records['Mem']*records['Elapsed']
    records.drop(columns=['_mem_unit'], inplace=True)

    # GPUs: get the number of allocated GPUs and GPU-seconds.
    records['NGPUS'] = records['AllocTRES'].str.extract(r'gpu=(\d+)',
                                                        expand=False)
    records['NGPUS'] = pd.to_numeric(
        records['NGPUS'], errors='coerce').fillna(0).astype('Int64')
    records['GPUTime'] = records['NGPUS']*records['Elapsed']

    if not slurm_names:
        # Lowercase the column names and keep only the shorthand fields.
        keep_fields = ['jobid', 'user', 'account', 'submit', 'start', 'end',
                       'ncpus', 'nnodes', 'reqmem', 'timelimit', 'state',
                       'reqtres', 'priority', 'partition', 'reqcpus', 'mem',
                       'ngpus', 'alloctres']
        records.columns = records.columns.str.lower()
        records = records.drop(columns=records.columns.difference(keep_fields))

    return records


def _slurm_consistency_check(records):
    '''Perform consistency checks of the SLURM records.'''
    print('Consistency check started.')

    # Exclude running and pending jobs from the analysis.
    states = ['RUNNING', 'PENDING']
    check = records['State'].isin(states)
    if check.any():
        print(f'  {check.sum()} records of jobs in {states} states have been '
              'excluded from the consistency check.')
        records = records[~check]

    # Runaway jobs.
    # Some 'FAILED' records might have NaN in 'End' due to SLURM glitches;
    # these are called runaway jobs. They can be fixed by running
    # 'sacctmgr show RunawayJobs' on the cluster.
    # We also check all other time columns just in case.
    check = records[time_columns].isna().any(axis=1)
    if check.any():
        print(f'  NaNs detected in columns {time_columns} in the following '
              f'{check.sum()} records that have been excluded: '
              f'{records.loc[check, "JobID"].to_list()}')
        records = records[~check]

    # Data consistency checks.
    # Verify that 'End' - 'Start' is equal to 'Elapsed'.
    check = ((records['End'] - records['Start']).dt.total_seconds()
             .astype('int64') - records['Elapsed']) != 0
    if check.any():
        print(f'  Failed consistency check for Elapsed on the following '
              f'{check.sum()} JobIDs:', records.loc[check, 'JobID'].to_list())

    # Verify that 'NCPUS' * 'Elapsed' is equal to 'CPUTime'.
    check = (records['NCPUS']*records['Elapsed'] - records['CPUTime']) != 0
    if check.any():
        print(f'  Failed consistency check for CPUTime on the following '
              f'{check.sum()} JobIDs:', records.loc[check, 'JobID'].to_list())

    # Verify that 'AllocCPUS' and 'NCPUS' are the same (per SLURM documentation).
    check = records['AllocCPUS'] != records['NCPUS']
    if check.any():
        print(f'  Failed consistency check for AllocCPUS and NCPUS on the '
              f'following {check.sum()} JobIDs:',
              records.loc[check, 'JobID'].to_list())

    print('Consistency check ended.')
    return records
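# Sketch: running the consistency check manually, mirroring the commented-out
# debugging path in sacct_jobs() above. The check expects slurm's original
# column names ('State', 'End', 'JobID', ...), so slurm_names=True is used
# here. The date is a placeholder.
#   raw = _get_slurm_records(pd.to_datetime('2019-04-01T00:00:00'))
#   checked = _slurm_consistency_check(
#       _slurm_raw_processing(raw, slurm_names=True))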