瀏覽代碼

add system health check

shjung 2 周之前
父節點
當前提交
d3922ff1db

+ 1 - 0
evps-comm-server/src/main/java/com/evps/comm/server/config/ApplicationConfig.java

@@ -22,6 +22,7 @@ public class ApplicationConfig extends NettyServerConfig {
     private boolean loggingThread = false;
 
     private String processId = "evps-comm-server";
+    private double cpuLimits = 75;
     private boolean packetDebug = true;
     private int autoEndMinutes = 20;
     private int lastCommTimeoutSeconds = 60;

+ 97 - 15
evps-comm-server/src/main/java/com/evps/comm/server/scheduler/ApplicationScheduler.java

@@ -1,17 +1,26 @@
 package com.evps.comm.server.scheduler;
 
+import com.evps.comm.server.config.ApplicationConfig;
 import com.evps.comm.server.config.TraceConfig;
 import com.evps.comm.server.repository.ApplicationRepository;
 import com.evps.comm.server.service.EvpsServiceManagerService;
 import com.evps.comm.server.service.UnitSystService;
+import com.evps.common.utils.SystemHealth;
+import com.zaxxer.hikari.HikariDataSource;
+import com.zaxxer.hikari.HikariPoolMXBean;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
-import org.springframework.scheduling.annotation.Async;
 import org.springframework.scheduling.annotation.EnableScheduling;
 import org.springframework.scheduling.annotation.Scheduled;
 import org.springframework.stereotype.Component;
 
 import javax.annotation.PreDestroy;
+import java.lang.management.ManagementFactory;
+import java.lang.management.ThreadInfo;
+import java.lang.management.ThreadMXBean;
+import java.text.DecimalFormat;
+import java.util.EnumMap;
+import java.util.Map;
 
 @Slf4j
 @RequiredArgsConstructor
@@ -20,41 +29,54 @@ import javax.annotation.PreDestroy;
 public class ApplicationScheduler {
 
     private final TraceConfig traceConfig;
+    private final ApplicationConfig config;
     private final UnitSystService unitSystService;
     private final ApplicationRepository applicationRepository;
     private final EvpsServiceManagerService evpsServiceManagerService;
 
+    private final HikariDataSource dataSource;
+    private final Map<Thread.State, Integer> stateCountMap = new EnumMap<>(Thread.State.class);
+    private final SystemHealth systemHealth = new SystemHealth();
+    private final DecimalFormat df = new DecimalFormat("#.##");
+
     @PreDestroy
     public void onShutDown() {
         log.info("ApplicationScheduler.onShutDown: Shutting down...");
     }
 
-//    @ScheduleElapsed
-    @Async
     @Scheduled(cron = "0 * * * * *")
     public void updateProcessState() {
+        try {
+            this.systemHealth.checkHealth(false);
+            loggingHealthCheck();
+
+            double cpuUsage = this.systemHealth.getCpuUsage();
+            if (cpuUsage > this.config.getCpuLimits()) {
+//                ApplicationRepository.processState.setErrDesc("CPU 사용율이 너무 높음: " + String.format("%.2f", cpuUsage));
+                log.warn("[SKIP] ApplicationScheduler.updateProcessState: High CPU Usage, Limit({} %), Current({} %), Schedule Job SKIP...",
+                        this.config.getCpuLimits(), String.format("%.2f", cpuUsage));
+                loggingThreads();
+                return;
+            }
+        } catch (Exception e) {
+            log.error("ApplicationScheduler.updateProcessState: System Health Check Exception {}", e.getMessage());
+        }
+
         try {
             this.unitSystService.updateUnitSystStts();
         }
         catch(Exception e) {
             log.error("ApplicationScheduler.updateProcessState: Exception {}", e.getMessage());
         }
-    }
 
-    //    @ScheduleElapsed
-    @Async
-    @Scheduled(cron = "30 * * * * *")  // 1분주기 작업 실행
-    public void loadTraceConfig() {
         try {
             this.traceConfig.loadTraceInfo();
         }
         catch(Exception e) {
-            log.error("ApplicationScheduler.loadTraceConfig: Exception {}", e.getMessage());
+            log.error("ApplicationScheduler.loadTraceInfo: Exception {}", e.getMessage());
         }
     }
 
-    //    @ScheduleElapsed
-    @Async
     @Scheduled(cron = "0/30 * * * * *")
     public void serviceMangerSchedule() {
         try {
@@ -65,8 +87,6 @@ public class ApplicationScheduler {
         }
     }
 
-//    @ScheduleElapsed
-    @Async
     @Scheduled(cron = "20 * * * * *")  // 1분주기 작업 실행
     public void reportCenterSessions() {
         try {
@@ -77,8 +97,6 @@ public class ApplicationScheduler {
         }
     }
 
-//    @ScheduleElapsed
-    @Async
     @Scheduled(cron = "10 0/5 * * * *")  // 5분주기 작업 실행
     public void loadBaseDatabase() {
         try {
@@ -89,4 +107,68 @@ public class ApplicationScheduler {
         }
     }
 
+    /**
+     * Writes one framed health report to the log: system metrics, DB connection
+     * pool status and thread-state counts, enclosed by two separator lines.
+     */
+    private void loggingHealthCheck() {
+        final String separator = "----------------------------------------------------------------------------------------------------------";
+
+        log.info(separator);
+        loggingSystemHealth();
+        logSessionStatus();
+        loggingThreadState();
+        log.info(separator);
+    }
+
+    /**
+     * Logs a single INFO line with the values currently held by {@code systemHealth}:
+     * CPU cores/usage/load average, memory usage, thread counts and GC totals.
+     * A negative load average is printed as {@code N/A}.
+     */
+    private void loggingSystemHealth() {
+        final double loadAverage = this.systemHealth.getLoadAverage();
+        final String loadAverageText;
+        if (loadAverage < 0) {
+            loadAverageText = "N/A";
+        } else {
+            loadAverageText = this.df.format(loadAverage);
+        }
+
+        final String cpuUsageText = this.df.format(this.systemHealth.getCpuUsage());
+        final String memUsageText = this.df.format(this.systemHealth.getMemUsage());
+
+        log.info("SYSTEM HEALTH: CPU {} Cores[{} %, LoadAvg({})], Memory[{} %, Used({} MB), Max({} MB)], Threads[{}, Peak({})], GC[{}, {} ms]",
+                this.systemHealth.getCpuCores(),
+                cpuUsageText,
+                loadAverageText,
+                memUsageText,
+                this.systemHealth.getUsedMemory(),
+                this.systemHealth.getMaxMemory(),
+                this.systemHealth.getThreadCount(),
+                this.systemHealth.getPeakThreadCount(),
+                this.systemHealth.getGcCount(),
+                this.systemHealth.getGcTime());
+    }
+
+    /**
+     * Logs the Hikari connection pool counters (total, active, idle, waiting threads).
+     *
+     * <p>{@code getHikariPoolMXBean()} returns {@code null} while the pool has not been
+     * started yet, so that case is logged explicitly instead of failing with a
+     * {@link NullPointerException} that would abort the rest of the health check.
+     */
+    private void logSessionStatus() {
+        HikariPoolMXBean poolStats = this.dataSource.getHikariPoolMXBean();
+        if (poolStats == null) {
+            log.info("   DB SESSION: N/A (connection pool not initialized)");
+            return;
+        }
+        log.info("   DB SESSION: Total: {}, Active: {}, Idle: {}, Waiting: {}",
+                poolStats.getTotalConnections(),
+                poolStats.getActiveConnections(),
+                poolStats.getIdleConnections(),
+                poolStats.getThreadsAwaitingConnection());
+    }
+
+    /**
+     * Counts all live threads per {@link Thread.State} and logs the counts on one line,
+     * formatted as {@code STATE(count), STATE(count), ...}.
+     * The counts are kept in {@code stateCountMap}, which is cleared at the start of each call.
+     */
+    private void loggingThreadState() {
+        ThreadInfo[] threadInfos = ManagementFactory.getThreadMXBean().dumpAllThreads(false, false);
+
+        this.stateCountMap.clear();
+        for (ThreadInfo threadInfo : threadInfos) {
+            this.stateCountMap.merge(threadInfo.getThreadState(), 1, Integer::sum);
+        }
+
+        StringBuilder summary = new StringBuilder();
+        for (Map.Entry<Thread.State, Integer> stateCount : this.stateCountMap.entrySet()) {
+            // Separator goes in front of every entry except the first one.
+            if (summary.length() > 0) {
+                summary.append(", ");
+            }
+            summary.append(String.format("%s(%d)", stateCount.getKey().name(), stateCount.getValue()));
+        }
+        log.info(" THREAD STATE: {}", summary.toString());
+    }
+
+    /**
+     * Logs one INFO line per live thread in the form {@code [id] name - STATE}.
+     *
+     * <p>Uses parameterized logging and leaves {@code stateCountMap} untouched:
+     * this method only lists threads, it does not need per-state counts.
+     */
+    private void loggingThreads() {
+        ThreadMXBean threadBean = ManagementFactory.getThreadMXBean();
+        ThreadInfo[] infos = threadBean.dumpAllThreads(false, false);
+        for (ThreadInfo info : infos) {
+            log.info("         [{}] {} - {}", info.getThreadId(), info.getThreadName(), info.getThreadState());
+        }
+    }
+
 }

+ 1 - 1
evps-common/src/main/java/com/evps/Main.java → evps-common/src/main/java/com/evps/common/Main.java

@@ -1,4 +1,4 @@
-package com.evps;
+package com.evps.common;
 
 //TIP To <b>Run</b> code, press <shortcut actionId="Run"/> or
 // click the <icon src="AllIcons.Actions.Execute"/> icon in the gutter.

+ 14 - 0
evps-common/src/main/java/com/evps/common/utils/GcStats.java

@@ -0,0 +1,14 @@
+package com.evps.common.utils;
+
+import lombok.Getter;
+
+/**
+ * Immutable holder for two garbage-collector figures: a collection count and a
+ * collection time, both exactly as supplied to the constructor.
+ */
+public class GcStats {
+
+    /** Collection count given at construction. */
+    @Getter
+    private final long count;
+
+    /** Collection time given at construction. */
+    @Getter
+    private final long time;
+
+    /**
+     * @param count collection count to store
+     * @param time  collection time to store
+     */
+    public GcStats(long count, long time) {
+        this.time = time;
+        this.count = count;
+    }
+}

+ 144 - 0
evps-common/src/main/java/com/evps/common/utils/SystemHealth.java

@@ -0,0 +1,144 @@
+package com.evps.common.utils;
+
+import com.sun.management.OperatingSystemMXBean;
+import lombok.Getter;
+
+import java.io.File;
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.lang.management.MemoryMXBean;
+import java.time.Instant;
+import java.time.ZoneId;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Collects JVM and host health figures (memory, CPU, threads, GC, optional disk usage).
+ *
+ * <p>Static facts (OS name/version, core count, max memory, JVM start time) are captured
+ * once in the constructor; everything else is refreshed by {@link #checkHealth(boolean)}
+ * and read through the Lombok-generated getters. The class performs no synchronization.
+ */
+@Getter
+public class SystemHealth {
+
+    private static final long BYTES_PER_MB = 1024L * 1024L;
+
+    private final String osName;
+    private final String osVersion;
+    private final int cpuCores;
+    private final long maxMemory;       // MB
+
+    private long totalMemory = 0;   // MB
+    private long freeMemory = 0;    // MB
+    private long usedMemory = 0;    // MB
+    private long heapUsed;          // MB
+    private long nonHeapUsed;       // MB
+
+    private double memUsage = 0.;   // percent of maxMemory
+    private double cpuUsage = 0.;   // percent, 0 when the JVM cannot report it
+    private double loadAverage;     // system load average, negative when unavailable
+
+    private int threadCount = 0;    // current number of live threads
+    private int peakThreadCount;    // peak live thread count reported by the JVM
+
+    private long jvmUptime;         // milliseconds, e.g. 3600000 means the JVM has been running for 1 hour
+    private long loadedClassCount;
+
+    private long diskTotal;         // bytes
+    private long diskFree;          // bytes
+    private double diskUsage;       // percent, 0 when the total size is unknown
+
+    private final Map<String, GcStats> gcStatsMap = new HashMap<>();
+    private String gcName;          // name of the last collector returned by the JVM
+    private long gcCount = 0;       // sum of collection counts over all collectors
+    private long gcTime = 0;        // sum of collection times over all collectors, milliseconds
+    private final String jvmStartTime;
+
+    /**
+     * Captures the values that do not change during the life of the JVM.
+     */
+    public SystemHealth() {
+        this.jvmStartTime = Instant.ofEpochMilli(
+                ManagementFactory.getRuntimeMXBean().getStartTime()
+        ).atZone(ZoneId.systemDefault()).toString();
+
+        Runtime runtime = Runtime.getRuntime();
+        this.maxMemory = runtime.maxMemory() / BYTES_PER_MB;
+        this.cpuCores = runtime.availableProcessors();
+        this.osName = System.getProperty("os.name");
+        this.osVersion = System.getProperty("os.version");
+    }
+
+    /**
+     * Refreshes all dynamic metrics.
+     *
+     * @param useDisk when {@code true}, also measures total/free space of the root path {@code "/"}
+     */
+    public void checkHealth(boolean useDisk) {
+        Runtime runtime = Runtime.getRuntime();
+        long totalBytes = runtime.totalMemory();
+        long freeBytes = runtime.freeMemory();
+        this.totalMemory = totalBytes / BYTES_PER_MB;
+        this.freeMemory = freeBytes / BYTES_PER_MB;
+        this.usedMemory = (totalBytes - freeBytes) / BYTES_PER_MB;
+
+        MemoryMXBean memoryMXBean = ManagementFactory.getMemoryMXBean();
+        this.heapUsed = memoryMXBean.getHeapMemoryUsage().getUsed() / BYTES_PER_MB;
+        this.nonHeapUsed = memoryMXBean.getNonHeapMemoryUsage().getUsed() / BYTES_PER_MB;
+
+        // Guard the division so a zero max never yields NaN/Infinity.
+        this.memUsage = (this.maxMemory > 0) ? (double) this.usedMemory / this.maxMemory * 100 : 0.;
+
+        // The load average is available on the standard bean; the CPU load needs the
+        // com.sun.management extension, which not every JVM provides.
+        java.lang.management.OperatingSystemMXBean baseOsBean = ManagementFactory.getOperatingSystemMXBean();
+        this.loadAverage = baseOsBean.getSystemLoadAverage(); // 1-minute average
+        if (baseOsBean instanceof OperatingSystemMXBean) {
+            double systemLoad = ((OperatingSystemMXBean) baseOsBean).getSystemCpuLoad();
+            // A negative (or NaN) value means "not available"; report 0 instead of -100 %.
+            this.cpuUsage = (Double.isNaN(systemLoad) || systemLoad < 0) ? 0. : systemLoad * 100;
+        } else {
+            this.cpuUsage = 0.;
+        }
+
+        this.threadCount = ManagementFactory.getThreadMXBean().getThreadCount();
+        this.peakThreadCount = ManagementFactory.getThreadMXBean().getPeakThreadCount();
+
+        this.jvmUptime = ManagementFactory.getRuntimeMXBean().getUptime();
+        this.loadedClassCount = ManagementFactory.getClassLoadingMXBean().getLoadedClassCount();
+
+        if (useDisk) {
+            File root = new File("/");
+            if (root.exists() && root.canRead()) {
+                this.diskTotal = root.getTotalSpace();
+                this.diskFree = root.getFreeSpace();
+                long used = this.diskTotal - this.diskFree;
+                this.diskUsage = (this.diskTotal > 0) ? (double) used / this.diskTotal * 100 : 0.;
+            }
+        }
+
+        // The MXBean counters are already cumulative, so the totals are rebuilt from
+        // zero on every call instead of being added on top of the previous totals.
+        long totalGcCount = 0;
+        long totalGcTime = 0;
+        for (GarbageCollectorMXBean gc : ManagementFactory.getGarbageCollectorMXBeans()) {
+            long count = gc.getCollectionCount();
+            long time = gc.getCollectionTime();
+            this.gcName = gc.getName();
+            this.gcStatsMap.put(gc.getName(), new GcStats(count, time));
+            // -1 means "undefined for this collector" and must not reduce the totals.
+            if (count > 0) {
+                totalGcCount += count;
+            }
+            if (time > 0) {
+                totalGcTime += time;
+            }
+        }
+        this.gcCount = totalGcCount;
+        this.gcTime = totalGcTime;
+    }
+
+    /**
+     * Formats the uptime captured by the last {@link #checkHealth(boolean)} call.
+     *
+     * @return the uptime as {@code "<days>d <hours>h <minutes>m <seconds>s"}
+     */
+    public String getFormattedUptime() {
+        long uptimeSeconds = this.jvmUptime / 1000;
+        long uptimeMinutes = uptimeSeconds / 60;
+        long uptimeHours = uptimeMinutes / 60;
+        long uptimeDays = uptimeHours / 24;
+        return uptimeDays + "d " + (uptimeHours % 24) + "h " + (uptimeMinutes % 60) + "m " + (uptimeSeconds % 60) + "s";
+    }
+
+}