ACCRE R9 Cluster Quick and Dirty Status
Report generated at Fri Oct 10 04:23:01 PM CDT 2025
Problem Nodes
HOSTNAMES STATE AVAIL_FEATURES TIMESTAMP USER REASON
cn0001 drained x86-64v3 2025-05-22T15:29:13 appelte1 Sam - NA - Hammerspace testing, decom when done
cn1393 draining intel_e5-2630_v3,haswell,intel 2025-09-24T19:23:08 root Kill task failed
cn1398 drained* intel_e5-2630_v3,haswell,intel 2025-08-20T10:01:56 root Scott - NA - Check rack/network
cn1451 drained* intel_4116,skylake,intel,x86-6 2025-09-10T14:19:11 broadrt Troy - RT93447 - backplane connector issue
cn1508 draining intel_4110,skylake,intel,x86-6 2025-09-17T17:55:40 root Kill task failed
cn1539 drained intel_5218,cascadelake,intel,x 2025-02-05T16:04:39 root Alan - RT N/A - NO TOUCHY?
cn1626 drained amd_7313,zen2,zen,amd,x86-64v3 2025-10-01T15:30:41 broadrt Troy - RT95231 - repeated down; not responding
gpu0024 inval pascal,p3840,intel_e5-2623_v4, 2025-10-06T12:32:25 slurm Low RealMemory (reported:192788 < 100.00% of configu
gpu0035 drained* turing,intel_5118,skylake,inte 2025-09-29T09:49:11 broadrt Troy - RT94482 - CPU Voltage issue
gpu0036 drained* turing,intel_5118,skylake,inte 2025-09-29T14:05:42 broadrt Troy - RT91761 - System instability, voltage
gpu0037 drained turing,intel_5118,skylake,inte 2025-09-29T14:06:35 broadrt Troy - RT91524 - frequent reboots
gpu0038 drained* turing,intel_5118,skylake,inte 2025-10-07T16:23:21 slurm gres/gpu count reported lower than configured (0 < 4
gpu0040 inval turing,intel_5118,skylake,inte 2025-10-07T16:19:35 slurm Low RealMemory (reported:353823 < 100.00% of configu
gpu0041 drained* turing,intel_5118,skylake,inte 2025-09-29T14:25:00 broadrt Troy - RT86927 - Memory and GPU issues : Not respond
gpu0044 drained* turing,intel_5118,skylake,inte 2025-09-29T14:28:50 broadrt Troy - RT91494 - Decom per Eric
gpu0045 drained* turing,intel_5118,skylake,inte 2025-09-29T14:29:31 broadrt Troy - Provisioning - resume when R9 and FIXED
gpu0046 inval turing,intel_5118,skylake,inte 2025-10-07T16:19:26 slurm gres/gpu count reported lower than configured (3 < 4
gpu0048 drained* turing,intel_5118,skylake,inte 2025-09-29T14:30:13 broadrt Troy - Provisioning - resume when R9 and FIXED
gpu0049 drained* turing,intel_5118,skylake,inte 2025-09-29T14:30:28 broadrt Troy - gres/gpu count reported lower than configured
gpu0053 drained* turing,intel_5118,skylake,inte 2025-10-07T16:21:06 slurm Troy - gres/gpu count reported lower than configured
gpu0057 drained* turing,intel_4214r,cascadelake 2025-09-16T10:29:50 broadrt Troy - RT91522 - unstable system
gpu0060 drained* a4000,amd_7313,zen3,zen,amd,x8 2025-09-16T10:28:41 broadrt Troy - RT91685 - internal interface firmware issue
gpu0061 drained a4000,amd_7313,zen3,zen,amd,x8 2025-09-29T14:31:44 broadrt Troy - RT91685 - Imaging and setting /tmp partitioni
gpu0062 drained a6000,intel_platinum_8358,icel 2025-10-08T13:28:43 slurm Prolog error
gpu0064 inval a6000,intel_platinum_8358,icel 2025-10-07T16:19:26 slurm gres/gpu count reported lower than configured (0 < 4
gpu0065 drained a6000,intel_platinum_8358,icel 2025-10-10T15:11:28 slurm Sai - N/A - Shutting down to add NVME : Not respondi
gpu0071 drained a6000,intel_platinum_8358,icel 2025-10-10T15:07:17 root Sai - N/A - Shutting down to add NVME
gpu0074 down* a6000,intel_platinum_8358,icel 2025-10-02T14:00:00 slurm Not responding
gpu0084 drained* zen,a100 2025-09-16T09:13:01 root Sai - NA - RMA to Exxact
Queue Summary (Batch)
GROUP USER ACTIVE_JOBS ACTIVE_CORES PENDING_JOBS PENDING_CORES
-----------------------------------------------------------------------------------------
accre_guests 6 6 0 0
connorps 6 6 0 0
-----------------------------------------------------------------------------------------
beam_lab 1 16 0 0
zhuj29 1 16 0 0
-----------------------------------------------------------------------------------------
behringer_lab 1 8 0 0
haleof 1 8 0 0
-----------------------------------------------------------------------------------------
booth_lab 4 11 0 0
chenh55 1 4 0 0
comptoab 1 2 0 0
mathura 1 4 0 0
wanj129 1 1 0 0
-----------------------------------------------------------------------------------------
brg_cores 1 16 0 0
kandelr 1 16 0 0
-----------------------------------------------------------------------------------------
cgg 0 0 1 64
liy110 0 0 1 64
-----------------------------------------------------------------------------------------
cms 184 2900 693 1404
cmslocal 148 1612 137 137
cmspilot 36 1288 556 1267
-----------------------------------------------------------------------------------------
coxlab 1 5 0 0
guaglise 1 5 0 0
-----------------------------------------------------------------------------------------
cqs_si 0 0 4 8
chenarsw 0 0 4 8
-----------------------------------------------------------------------------------------
csb_sanders 18 720 0 0
lig7 18 720 0 0
-----------------------------------------------------------------------------------------
das_lab 2 6 0 0
shiltmh1 2 6 0 0
-----------------------------------------------------------------------------------------
davis_lab 0 0 1 1
salerl1 0 0 1 1
-----------------------------------------------------------------------------------------
econgrads 19 19 0 0
chenl40 19 19 0 0
-----------------------------------------------------------------------------------------
edwards_lab 2 6 0 0
gorejl1 1 5 0 0
lewist5 1 1 0 0
-----------------------------------------------------------------------------------------
finstata_group 1 8 0 0
hand7 1 8 0 0
-----------------------------------------------------------------------------------------
g_gamazon_lab 1 2 0 0
evanspd1 1 2 0 0
-----------------------------------------------------------------------------------------
h_biostat_kang 348 348 0 0
yanb1 348 348 0 0
-----------------------------------------------------------------------------------------
h_biostat_student 5 42 0 0
koy2 1 12 0 0
namy1 1 1 0 0
shil10 1 1 0 0
yih4 2 28 0 0
-----------------------------------------------------------------------------------------
h_cqs 27 273 3 10
shengq1 23 212 0 0
xuh14 3 60 0 0
yangj24 1 1 3 10
-----------------------------------------------------------------------------------------
h_cutting_lab 37 148 613 2452
harrioem 37 148 613 2452
-----------------------------------------------------------------------------------------
h_lu_lab 3 6 3 48
gaow9 3 6 0 0
parkj71 0 0 3 48
-----------------------------------------------------------------------------------------
hodges_lab 1 1 0 0
dayj3 1 1 0 0
-----------------------------------------------------------------------------------------
h_vuiis 2 5 0 0
vuiis_archive 2 5 0 0
-----------------------------------------------------------------------------------------
isde-rer 1 1 0 0
champaca 1 1 0 0
-----------------------------------------------------------------------------------------
l2_jan_lab 12 40 10740 10740
davida7 11 33 10740 10740
olivij1 1 7 0 0
-----------------------------------------------------------------------------------------
l3_aboud_lab 2 16 24 192
hongm1 2 16 24 192
-----------------------------------------------------------------------------------------
l3_below_lab 1 4 0 0
nitinr 1 4 0 0
-----------------------------------------------------------------------------------------
l3_precision_nutrition_lab 2 14 0 0
baghem1 2 14 0 0
-----------------------------------------------------------------------------------------
l3_varc 3 13 0 0
guidubjl 1 7 0 0
zhaot4 2 6 0 0
-----------------------------------------------------------------------------------------
l3_vuiis_cci 2 3 0 0
vuiis_daily_s 2 3 0 0
-----------------------------------------------------------------------------------------
land_lab 10 40 20 60
renx2 10 40 20 60
-----------------------------------------------------------------------------------------
lea_lab 43 344 268 1072
arneram 43 344 268 1072
-----------------------------------------------------------------------------------------
maier_lab 1 12 0 0
poggigp 1 12 0 0
-----------------------------------------------------------------------------------------
mchs_compbio 2 40 0 0
riedlio 2 40 0 0
-----------------------------------------------------------------------------------------
mcml 0 0 2 192
odenyogg 0 0 2 192
-----------------------------------------------------------------------------------------
meydan_lab 2 16 0 0
marksjp 2 16 0 0
-----------------------------------------------------------------------------------------
nbody 165 657 107 416
ligo 164 656 107 416
ruizrkm1 1 1 0 0
-----------------------------------------------------------------------------------------
ng_lab 1 8 0 0
kimj119 1 8 0 0
-----------------------------------------------------------------------------------------
palmeri_lab 91 91 1261 1264
jeongj6 6 6 0 0
lilbus1 85 85 1261 1264
-----------------------------------------------------------------------------------------
p_csb_meiler 97 1332 6451 90314
resv146 2 2 0 0
tydingcw 95 1330 6451 90314
-----------------------------------------------------------------------------------------
p_dsi 0 0 1 1
yangi1 0 0 1 1
-----------------------------------------------------------------------------------------
p_englot_group 1 30 0 0
makhoug 1 30 0 0
-----------------------------------------------------------------------------------------
p_masi 120 240 903 903
saundam1 120 240 903 903
-----------------------------------------------------------------------------------------
p_meiler 0 0 1 6
kaermel 0 0 1 6
-----------------------------------------------------------------------------------------
r_isde 1 4 0 0
trippej1 1 4 0 0
-----------------------------------------------------------------------------------------
rke_group 32 132 0 0
sleethmr 31 124 0 0
yangz31 1 8 0 0
-----------------------------------------------------------------------------------------
rokaslab 52 225 162 328
copea1 2 61 0 0
danist 47 131 160 320
hatmakea 2 8 2 8
riedlio 1 25 0 0
-----------------------------------------------------------------------------------------
rubinov_lab 3 42 0 0
abbasia 1 30 0 0
sardarn 2 12 0 0
-----------------------------------------------------------------------------------------
ruderferlab 1 15 0 0
okehc 1 15 0 0
-----------------------------------------------------------------------------------------
sarkar_lab 1 32 0 0
sarkah1 1 32 0 0
-----------------------------------------------------------------------------------------
sbcs 1 2 0 0
liq17 1 2 0 0
-----------------------------------------------------------------------------------------
stassun 1 60 5 241
medani 1 60 4 240
omalledr 0 0 1 1
-----------------------------------------------------------------------------------------
stein_lab 0 0 1 10
karakg1 0 0 1 10
-----------------------------------------------------------------------------------------
taylor_group 1 3 0 0
petrop3 1 3 0 0
-----------------------------------------------------------------------------------------
tk_lab 3 120 0 0
yoonh15 3 120 0 0
-----------------------------------------------------------------------------------------
vgi 14 196 17 238
parkj71 14 196 17 238
-----------------------------------------------------------------------------------------
walker_lab 4 8 2 2
deanrt 4 8 0 0
kastnpd1 0 0 2 2
-----------------------------------------------------------------------------------------
wankowicz_lab 1228 1228 33886 33886
wankows 1228 1228 33886 33886
-----------------------------------------------------------------------------------------
williams_roberson_lab 2 5 0 0
vundav2 1 4 0 0
yeohb1 1 1 0 0
-----------------------------------------------------------------------------------------
womelsdorf_lab 84 692 108 2344
gerritcg 84 692 108 2344
-----------------------------------------------------------------------------------------
yang_lab 1 8 0 0
shaoq1 1 8 0 0
-----------------------------------------------------------------------------------------
yang_lab_csb 1 10 0 0
jurichc 1 10 0 0
-----------------------------------------------------------------------------------------
zhu_group 1 32 0 0
zhuw12 1 32 0 0
-----------------------------------------------------------------------------------------
Totals: 2651 10261 55277 146196
Queue Summary (Batch GPU)
GROUP USER ACTIVE_JOBS ACTIVE_GPUS PENDING_JOBS PENDING_GPUS
-----------------------------------------------------------------------------------------
accre_guests_acc 4 4 0 0
liy110 4 4 0 0
-----------------------------------------------------------------------------------------
beam_lab_acc 8 8 0 0
pany11 8 8 0 0
-----------------------------------------------------------------------------------------
csb_gpu_acc 13 36 0 0
cryosparcuser 1 4 0 0
karadim 3 6 0 0
zhengm9 9 26 0 0
-----------------------------------------------------------------------------------------
h_oguz_lab_acc 0 0 2 2
oguinekj 0 0 1 1
yaox3 0 0 1 1
-----------------------------------------------------------------------------------------
mchaourab_acc 14 15 27171 27173
kaot1 11 11 27169 27169
may19 1 2 0 0
tangq3 1 1 2 4
wut18 1 1 0 0
-----------------------------------------------------------------------------------------
mchaourab-csb_acc 2 2 0 0
wut18 2 2 0 0
-----------------------------------------------------------------------------------------
mltf_acc 1 4 0 0
sheny19 1 4 0 0
-----------------------------------------------------------------------------------------
nbody_acc 1 8 0 0
khanfm 1 8 0 0
-----------------------------------------------------------------------------------------
p_dsi_acc 2 2 2 2
yangi1 2 2 2 2
-----------------------------------------------------------------------------------------
p_meiler_acc 64 64 6655 6655
tydingcw 64 64 6655 6655
-----------------------------------------------------------------------------------------
psychology_gpu_acc 10 10 26 26
gerritcg 10 10 26 26
-----------------------------------------------------------------------------------------
taylor_group_acc 0 0 2 2
criswea 0 0 2 2
-----------------------------------------------------------------------------------------
Totals: 119 153 33858 33860
Queue Summary (interactive)
GROUP USER ACTIVE_JOBS ACTIVE_CORES PENDING_JOBS PENDING_CORES
-----------------------------------------------------------------------------------------
g_giri_group_int 1 4 0 0
breyem3 1 4 0 0
-----------------------------------------------------------------------------------------
maiziezhou_lab_phd_int 1 30 0 0
yuanw2 1 30 0 0
-----------------------------------------------------------------------------------------
rubinov_lab_int 2 28 0 0
abbasia 1 20 0 0
mohamb2 1 8 0 0
-----------------------------------------------------------------------------------------
Totals: 4 62 0 0
Queue Summary (interactive_gpu)
GROUP USER ACTIVE_JOBS ACTIVE_GPUS PENDING_JOBS PENDING_GPUS
-----------------------------------------------------------------------------------------
Totals: 0 0 0 0
Partition Summary
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
interactive up 14-00:00:0 1 drain cn0001
interactive up 14-00:00:0 4 mix cn[1301,1805,1810-1811]
interactive up 14-00:00:0 21 idle cn[1287,1302,1322-1326,1328-1330,1707,1800-1804,1806-1809,1813]
batch* up 14-00:00:0 2 drain* cn[1398,1451]
batch* up 14-00:00:0 2 drng cn[1393,1508]
batch* up 14-00:00:0 2 drain cn[1539,1626]
batch* up 14-00:00:0 170 mix cn[1202-1203,1205-1206,1212-1213,1216-1219,1221-1222,1226-1228,1230-1231,1233,1240-1241,1257,1260,1264,1272-1276,1280-1282,1286,1297-1299,1304,1309-1310,1312,1314,1316-1317,1331-1335,1341,1352,1364,1366,1369,1371,1375,1377,1380,1385,1388-1391,1394-1395,1399-1400,1403,1407-1408,1410-1412,1414,1417-1420,1424-1425,1427,1430-1432,1434-1436,1438-1439,1442,1445-1446,1448-1450,1452-1453,1455-1458,1460,1462,1466,1471-1473,1475,1477-1479,1481,1489,1492,1499,1506,1509,1523,1525,1530,1533,1535-1537,1540,1543-1546,1548,1550-1551,1553-1554,1557-1559,1565,1571,1574,1576,1579,1582-1584,1588,1592-1596,1603-1606,1608,1614,1620-1621,1624,1627,1629-1633,1700,1702-1703,1706,1708,2000]
batch* up 14-00:00:0 215 alloc cn[1204,1207-1211,1215,1220,1223-1225,1229,1232,1234-1239,1242,1258-1259,1261-1262,1265-1271,1277-1279,1283-1285,1288-1296,1303,1305-1308,1311,1313,1315,1318,1320-1321,1327,1336-1340,1342-1351,1353-1355,1357-1363,1365,1367-1368,1370,1372-1374,1376,1378-1379,1381-1384,1387,1392,1396-1397,1401-1402,1404-1406,1409,1415-1416,1421-1423,1426,1437,1440-1441,1443,1447,1454,1461,1463-1464,1467-1470,1474,1476,1480,1482-1488,1490-1491,1493-1498,1500-1505,1507,1510-1520,1522,1524,1526-1529,1531-1532,1534,1538,1547,1549,1552,1555-1556,1561-1564,1566-1570,1573,1575,1577-1578,1580-1581,1585-1587,1589,1597,1602,1607,1609-1610,1612-1613,1615-1619,1622-1623,1625,1628,1701,1704-1705]
legacy_hw inact 14-00:00:0 0 n/a
batch_gpu up 14-00:00:0 4 inval gpu[0024,0040,0046,0064]
batch_gpu up 14-00:00:0 12 drain* gpu[0035-0036,0038,0041,0044-0045,0048-0049,0053,0057,0060,0084]
batch_gpu up 14-00:00:0 1 down* gpu0074
batch_gpu up 14-00:00:0 5 drain gpu[0037,0061-0062,0065,0071]
batch_gpu up 14-00:00:0 41 mix gpu[0013,0015,0017,0023,0026-0027,0033-0034,0039,0042,0050,0056,0059,0063,0066-0068,0070,0072-0073,0075-0080,0082,0300-0310],gracehopper[01-02],hgx01
batch_gpu up 14-00:00:0 1 alloc gpu0069
batch_gpu up 14-00:00:0 8 idle gpu[0018-0022,0081,0085],hgx02
interactive_gpu up 14-00:00:0 2 mix dgx[01,03]
interactive_gpu up 14-00:00:0 4 idle dgx[02,04],gpu[0058,0207]
sam up 2-02:00:00 2 idle cms-sam-[01-02]
dgx_temp up 14-00:00:0 2 mix dgx[01,03]
dgx_temp up 14-00:00:0 2 idle dgx[02,04]