ACCRE R9 Cluster Quick and Dirty Status

Report generated at Sun Jul 6 04:23:01 AM CDT 2025

Problem Nodes

HOSTNAMES      STATE      AVAIL_FEATURES                 TIMESTAMP            USER       REASON                                              
cn0001         drained    x86-64v3                       2025-05-22T15:29:13  appelte1   Sam - NA - Hammerspace testing, decom when done     
cn1368         drained    haswell                        2025-06-23T11:00:58  root       Nobody - RT93145 - Bad DIMM                         
cn1376         drained*   haswell                        2025-06-25T16:11:13  broadrt    Troy - RT90960 - dimm swapped and ext diags all pass
cn1379         drained*   haswell                        2025-05-23T17:27:09  root       Nobody - RT90691 - check memory instability         
cn1398         drained*   haswell                        2025-05-22T18:46:57  root       Provisioning - Nobody - R9 and FIX RT92064 or DECOM 
cn1413         drained*   haswell,sbcstmp                2025-03-25T16:59:30  ninchrsc   Samuel - Bad - Decom for parts                      
cn1436         drained    skylake                        2025-04-05T11:18:12  root       Nobody - RT91528 - internal NIC is flapping         
cn1514         drained    skylake                        2025-03-17T09:07:25  slurm      Nobody - RT91150 - Network link flapping : Not respo
cn1521         drained*   skylake                        2025-04-05T08:07:35  root       Nobody - RT91525 - bad dimm                         
cn1529         down*      skylake                        2025-07-04T01:21:50  slurm      Not responding                                      
cn1539         drained    cascadelake                    2025-02-05T16:04:39  root       Alan - RT N/A - NO TOUCHY?                          
cn1541         drained*   cascadelake                    2025-02-28T16:01:10  root       Samuel - Bad - Decom for parts                      
cn1544         draining   cascadelake                    2025-07-06T02:03:25  root       Kill task failed                                    
cn1561         drained    cascadelake                    2025-06-06T16:10:06  root       Nobody - TBD - NVME drive IO issue, reimage with SAT
cn1562         drained*   cascadelake                    2025-06-12T09:41:46  broadrt    Troy - RT - nvme-out Sata-moved to alternate bay - r
cn1566         drained*   cascadelake                    2025-06-23T13:19:31  broadrt    Troy - Has SSD - R9 imaging fails bad drive         
cn1570         drained*   cascadelake                    2025-06-18T13:30:41  broadrt    Troy - T/S SSD in bay1, nvme removed, verified fiber
cn1572         drained*   cascadelake                    2025-04-18T07:59:07  root       Nobody - RT91817 - Processor System Errors          
cn1573         down*      cascadelake                    2025-07-04T05:28:33  slurm      Not responding                                      
cn1574         drained*   cascadelake                    2025-06-18T13:26:58  broadrt    Troy - T/S SSD in bay1, nvme removed, verified fiber
cn1575         drained*   cascadelake                    2025-06-06T16:24:59  broadrt    Nobody - TBD - Will need to be replaced with SATA   
cn1578         drained*   cascadelake                    2025-06-04T10:30:19  broadrt    Eric - NVME swapped to SSD - Set aside for testing  
cn1582         drained    cascadelake                    2025-07-04T18:59:22  slurm      Prolog error                                        
cn1585         drained*   cascadelake                    2025-06-06T16:26:52  broadrt    Nobody - TBD - Will need to be replaced with SATA   
cn1587         drained*   cascadelake                    2025-06-06T18:55:05  broadrt    Nobody - TBD - lots of issues in checkmk may need mo
cn1611         drained    zen                            2025-02-28T10:53:50  root       Nobody - RT90770 - system instability causing reboot
cn2000         drained    zen                            2025-07-05T19:14:36  slurm      Prolog error                                        
gpu0014        drained*   pascal,p3584,intel_e5-2623_v4, 2025-06-20T12:45:52  root       Nobody - RT93119 - Unreachable via IPMI             
gpu0016        drained*   pascal,p3584,intel_e5-2623_v4, 2025-04-15T21:08:41  root       Nobody - RT91762 - possible bad GPU                 
gpu0025        drained*   pascal,p3840,intel_e5-2623_v4, 2025-06-20T15:00:51  slurm      Nobody - RT93118 - missing system memory : Not respo
gpu0030        drained*   pascal,p3840,intel_e5-2623_v4, 2025-06-20T18:19:40  appelte1   Nobody - RT93127 - possible bad link                
gpu0036        drained*   turing,intel_5118,skylake,inte 2025-04-15T21:06:27  root       Nobody - RT91761 - System instability, voltage probl
gpu0037        drained    turing,intel_5118,skylake,inte 2025-04-05T07:50:03  root       Nobody - RT91524 - frequent reboots                 
gpu0038        drained*   turing,intel_5118,skylake,inte 2025-05-19T14:53:48  appelte1   Provisioning - Nobody - resume when R9 and FIXED, se
gpu0040        drained*   turing,intel_5118,skylake,inte 2025-06-10T12:56:32  slurm      Nobody - RT91523 - Missing memory, likely bad DIMM :
gpu0041        drained*   turing,intel_5118,skylake,inte 2025-06-06T16:23:50  slurm      Nobody - RT86927 - Memory and GPU issues : Not respo
gpu0044        drained*   turing,intel_5118,skylake,inte 2025-04-05T07:46:42  root       Nobody - RT91494 - Failed GPU crashing kernel       
gpu0045        drained*   turing,intel_5118,skylake,inte 2025-05-29T09:19:52  slurm      Provisioning - Nobody - resume when R9 and FIXED, se
gpu0048        drained*   turing,intel_5118,skylake,inte 2025-05-19T14:54:38  appelte1   Provisioning - Nobody - resume when R9 and FIXED, se
gpu0057        drained*   cascadelake,turing             2025-04-05T07:40:48  root       Nobody - RT91522 - unstable system                  
gpu0060        drained*   zen3,a4000x8                   2025-04-13T09:35:22  root       Nobody - RT91685 - internal interface firmware issue
gpu0061        drained*   zen3,a4000x8                   2025-06-06T16:36:34  broadrt    Nobody - RT92893 - firmware error detected          
gpu0081        drained*   zen3,a6000x4                   2025-05-19T14:55:26  appelte1   Provisioning - Nobody - resume when R9 and GREEN    
gpu0082        drained*   zen3,a6000x4                   2025-05-19T14:55:26  appelte1   Provisioning - Nobody - resume when R9 and GREEN    
gpu0083        drained*   zen,a100                       2025-06-03T12:39:33  root       Scott - RT90395 - bad hardware Node RMAd            
gpu0300        inval      zen,a6000                      2025-07-02T19:16:28  slurm      gres/gpu count reported lower than configured (0 < 8
hgx01          inval      zen,h100                       2025-07-03T14:27:41  slurm      gres/gpu count reported lower than configured (7 < 8

Queue Summary (Batch)

GROUP        USER                  ACTIVE_JOBS  ACTIVE_CORES  PENDING_JOBS  PENDING_CORES
-----------------------------------------------------------------------------------------
beam_lab                               1           16             0             0
            zhuj29                     1           16             0             0
-----------------------------------------------------------------------------------------
behringer_lab                          4          128             0             0
            stonecj                    4          128             0             0
-----------------------------------------------------------------------------------------
biostat_faculty                        2           40             0             0
            xiangq2                    2           40             0             0
-----------------------------------------------------------------------------------------
booth_lab                              2            5             0             0
            chenh55                    1            1             0             0
            mathura                    1            4             0             0
-----------------------------------------------------------------------------------------
brg_cores                              4           64             0             0
            kandelr                    2           32             0             0
            xuy33                      2           32             0             0
-----------------------------------------------------------------------------------------
cms                                  237         2785           606          1548
            cmslocal                 184         1389           296           767
            cmspilot                  33         1364           306           777
            uscmslocal                20           32             4             4
-----------------------------------------------------------------------------------------
coxlab                                 0            0             9            72
            pettyas1                   0            0             9            72
-----------------------------------------------------------------------------------------
csb_sanders                           33         1320           137          5480
            lig7                      33         1320           137          5480
-----------------------------------------------------------------------------------------
davis_lab                              2            6             0             0
            nitinr                     1            2             0             0
            salerl1                    1            4             0             0
-----------------------------------------------------------------------------------------
econgrads                              1            1             0             0
            chenl40                    1            1             0             0
-----------------------------------------------------------------------------------------
g_benntor_lab                          1            2             0             0
            mccorcl1                   1            2             0             0
-----------------------------------------------------------------------------------------
g_gamazon_lab                         11           77            85            85
            bettimj                   11           77            85            85
-----------------------------------------------------------------------------------------
h_biostat_kang                         1            1             0             0
            yanb1                      1            1             0             0
-----------------------------------------------------------------------------------------
h_biostat_student                    312          312            60            60
            namy1                    312          312            60            60
-----------------------------------------------------------------------------------------
h_lu_lab                               1           10             1            10
            parkj71                    1           10             1            10
-----------------------------------------------------------------------------------------
h_vmac                                59           59          8658          8658
            lorenzas                  59           59          8658          8658
-----------------------------------------------------------------------------------------
jswhep                                 9           72             0             0
            atehort                    9           72             0             0
-----------------------------------------------------------------------------------------
l2_jan_lab                             1            8             0             0
            janta3                     1            8             0             0
-----------------------------------------------------------------------------------------
l3_aboud_lab                           0            0            94           752
            koirap1                    0            0            94           752
-----------------------------------------------------------------------------------------
l3_precision_nutriti                   2            6             0             0
            baghem1                    2            6             0             0
-----------------------------------------------------------------------------------------
l3_varc                                1            7             0             0
            zhaot4                     1            7             0             0
-----------------------------------------------------------------------------------------
l3_vuiis_cci                           1            2             0             0
            vuiis_daily_s              1            2             0             0
-----------------------------------------------------------------------------------------
l3_watson_lab                         10           37             0             0
            licerav                   10           37             0             0
-----------------------------------------------------------------------------------------
maiziezhou_lab                         1           50             0             0
            tangk10                    1           50             0             0
-----------------------------------------------------------------------------------------
mcml                                   0            0             1           128
            odenyogg                   0            0             1           128
-----------------------------------------------------------------------------------------
nbody                                109          424            88           340
            ligo                     109          424            88           340
-----------------------------------------------------------------------------------------
p_csb_meiler                         145          145           575           575
            huntek1                  145          145           575           575
-----------------------------------------------------------------------------------------
p_masi                                34           68         27860         27860
            kimm58                    34           68         27043         27043
            rudravg                    0            0           817           817
-----------------------------------------------------------------------------------------
p_matheny_lab                          2            6             0             0
            koolajd1                   2            6             0             0
-----------------------------------------------------------------------------------------
p_neuert_lab                          38          304             0             0
            neuertg                   38          304             0             0
-----------------------------------------------------------------------------------------
rer                                    2           24             0             0
            haidars                    1            8             0             0
            hum6                       1           16             0             0
-----------------------------------------------------------------------------------------
rokaslab                               3           42             0             0
            danist                     1            2             0             0
            davidkt                    1           16             0             0
            hatmakea                   1           24             0             0
-----------------------------------------------------------------------------------------
sbcs                                   0            0             1             1
            lyul1                      0            0             1             1
-----------------------------------------------------------------------------------------
taylor_group                         599          601             0             0
            gersbaka                 278          278             0             0
            petrop3                    1            3             0             0
            schultls                 320          320             0             0
-----------------------------------------------------------------------------------------
tk_lab                                 1           40             0             0
            yoonh15                    1           40             0             0
-----------------------------------------------------------------------------------------
walker_lab                             5          160             0             0
            walkeas2                   5          160             0             0
-----------------------------------------------------------------------------------------
wankowicz_lab                       1807         2104         65026         68176
            wankows                 1807         2104         65026         68176
-----------------------------------------------------------------------------------------
wan_lab                                1           20             0             0
            hardenn                    1           20             0             0
-----------------------------------------------------------------------------------------
womelsdorf_lab                        20          191             1            10
            gerritcg                  20          191             1            10
-----------------------------------------------------------------------------------------
yang_lab                               1            1             0             0
            shaoq1                     1            1             0             0
-----------------------------------------------------------------------------------------
yang_lab_csb                           2           20           100          1000
            jurichc                    2           20           100          1000
-----------------------------------------------------------------------------------------
Totals:                             3465         9158        103302        114755

Queue Summary (Batch GPU)

GROUP        USER                  ACTIVE_JOBS  ACTIVE_GPUS   PENDING_JOBS   PENDING_GPUS
-----------------------------------------------------------------------------------------
accre_guests_acc                       6            6             2             2
            liy110                     3            3             1             1
            tany14                     3            3             1             1
-----------------------------------------------------------------------------------------
csb_gpu_acc                           13           14             1             1
            karadim                    0            0             1             1
            zahadg                    12           12             0             0
            zhengm9                    1            2             0             0
-----------------------------------------------------------------------------------------
mltf_acc                               0            0             1             8
            sheny19                    0            0             1             8
-----------------------------------------------------------------------------------------
nbody_acc                              1            8             0             0
            khanfm                     1            8             0             0
-----------------------------------------------------------------------------------------
p_meiler_acc                           1            1             0             0
            mothcw                     1            1             0             0
-----------------------------------------------------------------------------------------
Totals:                               21           29             4            11

Queue Summary (interactive)

GROUP        USER                  ACTIVE_JOBS  ACTIVE_CORES  PENDING_JOBS  PENDING_CORES
-----------------------------------------------------------------------------------------
maiziezhou_lab_int                     3           77             0             0
            chowx                      2            7             0             0
            yuanw2                     1           70             0             0
-----------------------------------------------------------------------------------------
Totals:                                3           77             0             0

Partition Summary

PARTITION       AVAIL  TIMELIMIT  NODES  STATE NODELIST
interactive        up 14-00:00:0      1  drain cn0001
interactive        up 14-00:00:0      3    mix cn[1301,1800,1804]
interactive        up 14-00:00:0     13   idle cn[1287,1302,1322-1326,1328-1330,1801-1803]
batch*             up 14-00:00:0     15 drain* cn[1376,1379,1398,1413,1521,1541,1562,1566,1570,1572,1574-1575,1578,1585,1587]
batch*             up 14-00:00:0      2  down* cn[1529,1573]
batch*             up 14-00:00:0      1   drng cn1544
batch*             up 14-00:00:0      8  drain cn[1368,1436,1514,1539,1561,1582,1611,2000]
batch*             up 14-00:00:0    197    mix cn[1212,1236,1257,1260,1264,1268,1271-1280,1292,1294,1303-1304,1309-1310,1313-1314,1316,1320-1321,1331-1332,1336-1343,1346-1348,1350,1353-1354,1362,1373-1374,1389-1391,1393,1395,1399-1408,1410-1412,1415,1417-1419,1421-1422,1424-1427,1430-1432,1434-1435,1438-1440,1442-1443,1445-1458,1460-1461,1463-1464,1466-1472,1474-1479,1481-1483,1486-1489,1491,1494-1496,1498,1504,1506,1508,1511,1513,1516,1524-1525,1536-1538,1540,1545-1546,1549-1550,1553,1557-1559,1564-1565,1567-1568,1571,1576-1577,1579-1580,1584,1588-1589,1591-1592,1594-1597,1602-1610,1612-1625,1627-1628,1700-1706]
batch*             up 14-00:00:0    168  alloc cn[1202-1211,1213-1235,1237-1242,1258-1259,1261-1262,1265-1267,1269-1270,1281-1286,1288-1291,1293,1295-1299,1305,1308,1312,1327,1333-1335,1344-1345,1349,1351-1352,1355,1357-1361,1363-1367,1369-1372,1375,1377-1378,1380-1385,1387-1388,1392,1394,1396-1397,1409,1414,1416,1420,1423,1437,1441,1462,1473,1480,1484-1485,1490,1492-1493,1497,1499-1503,1505,1507,1509-1510,1512,1515,1517-1520,1522-1523,1526-1528,1530-1535,1543,1547-1548,1551-1552,1554-1556,1563,1569,1581,1583,1586,1593,1626,1629-1633]
legacy_hw       inact 14-00:00:0      0    n/a 
batch_gpu          up 14-00:00:0      2  inval gpu0300,hgx01
batch_gpu          up 14-00:00:0     17 drain* gpu[0014,0016,0025,0030,0036,0038,0040-0041,0044-0045,0048,0057,0060-0061,0081-0083]
batch_gpu          up 14-00:00:0      1  drain gpu0037
batch_gpu          up 14-00:00:0     19    mix gpu[0035,0039,0042,0062-0067,0069-0072,0074-0076,0084,0301-0302]
batch_gpu          up 14-00:00:0     39   idle gpu[0013,0015,0017-0024,0026-0027,0031,0033-0034,0046,0049-0050,0053,0056,0059,0068,0073,0077-0080,0085,0303-0310],gracehopper[01-02],hgx02
interactive_gpu    up 14-00:00:0      1   idle gpu0058
sam                up 2-02:00:00      1  alloc cms-sam-01
sam                up 2-02:00:00      1   idle cms-sam-02
dgx_temp           up 14-00:00:0      3    mix dgx[01,03-04]
dgx_temp           up 14-00:00:0      1   idle dgx02