ACCRE R9 Cluster Quick and Dirty Status

Report generated at Sun Apr 6 10:23:01 AM CDT 2025

Problem Nodes

HOSTNAMES      STATE      AVAIL_FEATURES                 TIMESTAMP            USER       REASON                                              
cn457          drained*   sandybridge                    2025-04-05T08:02:25  root       Nobody - NA - bad IPMI, decom                       
cn1131         drained*   sandybridge                    2025-02-06T16:56:30  slurm      Sai - RT 90326 - Requires troubleshooting configurat
cn1259         draining   haswell                        2025-04-05T20:17:30  root       Kill task failed                                    
cn1298         drained*   haswell                        2025-04-05T08:46:30  root       Nobody - RT91527 - node will not boot properly      
cn1308         draining   haswell                        2025-04-05T20:27:10  root       Kill task failed                                    
cn1377         drained    haswell                        2025-02-24T12:51:55  appelte1   Nobody - RT90691 - check memory instability         
cn1399         drained*   haswell                        2025-02-27T09:52:56  root       Scott Took - RT 90327 - Not reachable               
cn1413         drained*   haswell,sbcstmp                2025-03-25T16:59:30  ninchrsc   Samuel - Bad - Decom for part                       
cn1424         drained*   haswell                        2025-03-17T14:09:17  root       Provisioning - Scott - resume when R9 and GREEN     
cn1427         drained*   haswell                        2025-03-13T07:24:41  appelte1   Provisioning - Nobody - resume when R9 and GREEN    
cn1436         drained*   skylake                        2025-04-05T11:18:12  root       Nobody - RT91528 - internal NIC is flapping         
cn1446         down       skylake                        2025-04-06T01:51:02  slurm      Node unexpectedly rebooted                          
cn1450         down       skylake                        2025-04-05T19:30:00  slurm      Node unexpectedly rebooted                          
cn1461         down       skylake                        2025-04-05T21:41:48  slurm      Node unexpectedly rebooted                          
cn1488         drained    skylake                        2025-04-05T18:04:24  root       Eric - NA - Kill task failed, drain and reboot      
cn1493         down       skylake                        2025-04-05T23:30:24  slurm      Node unexpectedly rebooted                          
cn1504         drained    skylake                        2025-02-16T13:51:45  slurm      Nobody - 90507 - NIC flapping : Not responding      
cn1510         draining   skylake                        2025-04-05T18:04:24  root       Eric - NA - Kill task failed, drain and reboot      
cn1514         drained    skylake                        2025-03-17T09:07:25  slurm      Nobody - RT91150 - Network link flapping : Not respo
cn1518         drained    skylake                        2025-04-03T11:44:28  root       Nobody - RT91488 - down, ipmi is also not reachable 
cn1521         drained*   skylake                        2025-04-05T08:07:35  root       Nobody - RT91525 - bad dimm                         
cn1524         down       skylake                        2025-04-05T19:14:30  slurm      Node unexpectedly rebooted                          
cn1534         down       skylake                        2025-04-05T19:19:17  slurm      Node unexpectedly rebooted                          
cn1539         drained    cascadelake                    2025-02-05T16:04:39  root       Alan - RT N/A - NO TOUCHY?                          
cn1541         drained*   cascadelake                    2025-02-28T16:01:10  root       Samuel - Bad - Decom for parts                      
cn1543         drained    cascadelake                    2025-04-01T09:42:57  root       Samuel - RT91355 - Bad Batteries                    
cn1546         drained    cascadelake                    2025-04-01T09:42:57  root       Samuel - RT91355 - Bad Batteries                    
cn1552         drained    cascadelake                    2025-04-01T09:42:57  root       Samuel - RT91355 - Bad Batteries                    
cn1553         drained    cascadelake                    2025-03-21T08:57:37  appelte1   Nobody - RT91248 - undiagnosed system instability   
cn1556         drained    cascadelake                    2025-03-21T08:57:37  appelte1   Nobody - RT91248 - undiagnosed system instability   
cn1557         drained    cascadelake                    2025-03-21T08:57:37  appelte1   Nobody - RT91248 - undiagnosed system instability   
cn1561         drained    cascadelake                    2025-04-01T09:42:57  root       Samuel - RT91355 - Bad Batteries                    
cn1564         drained    cascadelake                    2025-04-01T09:42:57  root       Samuel - RT91355 - Bad Batteries                    
cn1572         drained*   cascadelake                    2025-04-04T06:20:45  slurm      Samuel - RT91355 - Bad Batteries : Not responding   
cn1575         drained    cascadelake                    2025-04-01T09:42:57  root       Samuel - RT91355 - Bad Batteries                    
cn1577         drained    cascadelake                    2025-04-01T09:42:57  root       Samuel - RT91355 - Bad Batteries                    
cn1580         drained*   cascadelake                    2025-03-13T07:24:41  appelte1   Provisioning - Nobody - resume when R9 and GREEN    
cn1588         drained    cascadelake                    2025-04-01T09:42:57  root       Samuel - RT91355 - Bad Batteries                    
cn1590         drained*   cascadelake                    2025-04-03T11:44:28  root       Nobody - RT91488 - down, ipmi is also not reachable 
cn1611         drained    zen                            2025-02-28T10:53:50  root       Nobody - RT90770 - system instability causing reboot
cn1629         draining   zen                            2025-04-05T08:36:20  root       Nobody - NA - tmp space exceeded, let drain and clea
gpu0016        drained*   broadwell,pascal,p3584         2025-03-25T08:20:38  root       Nobody - RT 90330 - Migrate to R9                   
gpu0024        drained*   broadwell,pascal,p3840         2025-03-25T08:20:38  root       Nobody - RT 90330 - Migrate to R9                   
gpu0037        drained    skylake,turing,csbtmp          2025-04-05T07:50:03  root       Nobody - RT91524 - frequent reboots                 
gpu0040        inval      skylake,turing,csbtmp          2025-04-05T07:48:13  root       Nobody - 91523 - Possible bad DIMM                  
gpu0044        inval      skylake,turing,csbtmp          2025-04-05T07:46:42  root       Nobody - 91494 - Failed GPU crashing kernel         
gpu0053        drained*   skylake,turing                 2025-03-13T07:23:41  appelte1   Provisioning - Nobody - resume when R9 and GREEN    
gpu0057        drained*   cascadelake,turing             2025-04-05T07:40:48  root       Nobody - RT91522 - unstable system                  
gpu0058        drained*   skylake,a4000x4                2025-03-11T07:43:50  appelte1   Provisioning - Nobody - resume when R9 and GREEN    
gpu0083        drained*   zen,a100                       2025-04-05T07:39:36  root       Scott - RT90395 - bad hardware                      
gracehopper01  drained*   aarch,hopper                   2025-04-03T11:12:48  root       Nobody - Provisioning - Migrating to Rocky 9 infra  
gracehopper02  drained*   aarch,hopper                   2025-04-03T11:12:48  root       Nobody - Provisioning - Migrating to Rocky 9 infra  
hgx01          drained*   zen,h100                       2025-04-05T07:45:25  root       Sam - NA - Provisioning                             

Queue Summary (Batch)

GROUP        USER                  ACTIVE_JOBS  ACTIVE_CORES  PENDING_JOBS  PENDING_CORES
-----------------------------------------------------------------------------------------
aldrich_lab                            1            1             0             0
            amannn1                    1            1             0             0
-----------------------------------------------------------------------------------------
beam_lab                             114          204             0             0
            khancm                   108          108             0             0
            zhuj29                     6           96             0             0
-----------------------------------------------------------------------------------------
booth_lab                              5            5             0             0
            chenh55                    2            2             0             0
            muesm                      2            2             0             0
            wanj129                    1            1             0             0
-----------------------------------------------------------------------------------------
brg_cores                             14           81             0             0
            desilvt                   13           65             0             0
            kandelr                    1           16             0             0
-----------------------------------------------------------------------------------------
calipari_lab                           1            8             0             0
            barthb1                    1            8             0             0
-----------------------------------------------------------------------------------------
castiglione_lab                        1            1             0             0
            eisj                       1            1             0             0
-----------------------------------------------------------------------------------------
cms                                 1324         4582           101           371
            cmslocal                   3            9             0             0
            cmspilot                1321         4573           101           371
-----------------------------------------------------------------------------------------
hadjim_lab                             3           16             0             0
            comers                     1            8             0             0
            reasosa2                   2            8             0             0
-----------------------------------------------------------------------------------------
h_vuiis                                2            2             0             0
            vuiis_daily_s              2            2             0             0
-----------------------------------------------------------------------------------------
l3_bick_lab                            2           40             0             0
            qianh4                     2           40             0             0
-----------------------------------------------------------------------------------------
l3_precision_nutriti                   1            8             0             0
            baghem1                    1            8             0             0
-----------------------------------------------------------------------------------------
mcml                                   1           16             0             0
            subravvr                   1           16             0             0
-----------------------------------------------------------------------------------------
nakagawa_lab                           1            1             0             0
            nakagat1                   1            1             0             0
-----------------------------------------------------------------------------------------
nbody                                197          656            48           192
            ligo                     197          656            48           192
-----------------------------------------------------------------------------------------
palmeri_lab                          511          511             0             0
            jeongj6                  511          511             0             0
-----------------------------------------------------------------------------------------
p_csb_meiler                          55          764             0             0
            cheonglb                   1            8             0             0
            tydingcw                  54          756             0             0
-----------------------------------------------------------------------------------------
p_masi                               300          545          1484          2558
            amandm1                  101          201          1074          2148
            kimm58                   151          152           410           410
            liz79                     48          192             0             0
-----------------------------------------------------------------------------------------
p_matheny_lab                          1            2             0             0
            koolajd1                   1            2             0             0
-----------------------------------------------------------------------------------------
rokaslab                              31           90             5             5
            davidkt                   30           30             5             5
            riedlio                    1           60             0             0
-----------------------------------------------------------------------------------------
sbcs                                  18           36             1             1
            lyul1                     18           36             1             1
-----------------------------------------------------------------------------------------
walker_lab                             7          132             0             0
            fieldhm                    1            1             0             0
            kalmertl                   5          129             0             0
            walkeas2                   1            2             0             0
-----------------------------------------------------------------------------------------
wankowicz_lab                          3           48             0             0
            lix83                      3           48             0             0
-----------------------------------------------------------------------------------------
yang_lab_csb                           1            1             0             0
            yanglab_enzyh              1            1             0             0
-----------------------------------------------------------------------------------------
Totals:                             2594         7750          1639          3127

Queue Summary (Batch GPU)

GROUP        USER                  ACTIVE_JOBS  ACTIVE_GPUS   PENDING_JOBS   PENDING_GPUS
-----------------------------------------------------------------------------------------
csb_gpu_acc                            6           18             5             5
            karadim                    0            0             5             5
            melarafj                   2            2             0             0
            nakagat1                   4           16             0             0
-----------------------------------------------------------------------------------------
vuiis_masi_gpu_acc                     0            0             7             8
            liuy140                    0            0             1             2
            vuiis_daily_s              0            0             6             6
-----------------------------------------------------------------------------------------
wei_lab_acc                            1            2             0             0
            suw3                       1            2             0             0
-----------------------------------------------------------------------------------------
yang_lab_csb                           0            0             1             1
            ranx                       0            0             1             1
-----------------------------------------------------------------------------------------
Totals:                                7           20            13            14

Queue Summary (Interactive)

GROUP        USER                  ACTIVE_JOBS  ACTIVE_CORES  PENDING_JOBS  PENDING_CORES
-----------------------------------------------------------------------------------------
p_dsi_dgx                              3          132             0             0
            huy28                      2          128             0             0
            wut18                      1            4             0             0
-----------------------------------------------------------------------------------------
Totals:                                3          132             0             0

Partition Summary

PARTITION   AVAIL  TIMELIMIT  NODES  STATE NODELIST
interactive    up 14-00:00:0      2    mix dgx[01,03]
interactive    up 14-00:00:0      4   idle cn[0001,1287],dgx[02,04]
batch*         up 14-00:00:0     11 drain* cn[1298,1399,1413,1424,1427,1436,1521,1541,1572,1580,1590]
batch*         up 14-00:00:0      4   drng cn[1259,1308,1510,1629]
batch*         up 14-00:00:0     18  drain cn[1377,1488,1504,1514,1518,1539,1543,1546,1552-1553,1556-1557,1561,1564,1575,1577,1588,1611]
batch*         up 14-00:00:0    196    mix cn[1201-1211,1213-1215,1217-1225,1227-1229,1231-1232,1235-1242,1258,1262,1265-1270,1272,1275-1277,1286,1288-1291,1294-1297,1299,1305,1309,1312-1314,1316,1327,1364,1400-1401,1403-1410,1412,1414-1418,1420,1423,1430-1432,1434-1435,1437-1441,1443,1445,1447-1449,1451,1453-1457,1460,1467,1469-1471,1473-1480,1483,1485-1487,1489-1490,1495-1497,1499,1502-1503,1505,1515,1517,1522,1528,1530-1533,1536,1538,1540,1544-1545,1549-1551,1555,1559,1565,1567,1571,1576,1579,1581-1584,1586-1587,1591,1593-1597,1601-1610,1613-1617,1619,1623-1628,1630-1633,1700,1702-1703,1705]
batch*         up 14-00:00:0     87  alloc cn[1212,1216,1226,1230,1233-1234,1257,1260-1261,1264,1271,1273-1274,1278-1285,1292-1293,1303-1304,1310,1402,1411,1419,1425-1426,1442,1452,1458,1462-1464,1466,1468,1472,1481-1482,1484,1491-1492,1494,1498,1500-1501,1506-1509,1511-1513,1516,1519-1520,1523,1525-1527,1529,1535,1537,1547-1548,1554,1558,1562-1563,1566,1568-1570,1573-1574,1578,1585,1589,1592,1612,1618,1620-1622]
batch*         up 14-00:00:0      6   down cn[1446,1450,1461,1493,1524,1534]
legacy_hw      up 14-00:00:0      2 drain* cn[457,1131]
legacy_hw      up 14-00:00:0      1  alloc cn1124
batch_gpu      up 14-00:00:0      2  inval gpu[0040,0044]
batch_gpu      up 14-00:00:0      9 drain* gpu[0016,0024,0053,0057-0058,0083],gracehopper[01-02],hgx01
batch_gpu      up 14-00:00:0      1  drain gpu0037
batch_gpu      up 14-00:00:0      5    mix gpu[0036,0039,0067,0085,0301]
batch_gpu      up 14-00:00:0      4  alloc gpu[0062-0065]
batch_gpu      up 14-00:00:0     28   idle gpu[0041-0042,0050,0056,0060-0061,0066,0068-0076,0084,0300,0302-0310],hgx02
sam            up 2-02:00:00      2   idle cms-sam-[01-02]