From f8d0b344ae1b2dc3894c1a597c0565911b762742 Mon Sep 17 00:00:00 2001 From: John Klug Date: Tue, 19 Jan 2021 17:21:32 -0600 Subject: softdog-mon for monitoring a system using kernel module "softdog" --- src/Makefile.am | 6 + src/hog.c | 35 +++++ src/softdog-mon.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 413 insertions(+) create mode 100644 src/Makefile.am create mode 100644 src/hog.c create mode 100644 src/softdog-mon.c (limited to 'src') diff --git a/src/Makefile.am b/src/Makefile.am new file mode 100644 index 0000000..dba3d24 --- /dev/null +++ b/src/Makefile.am @@ -0,0 +1,6 @@ +## Process this file with automake to produce Makefile.in +AUTOMAKE_OPTIONS = gnu +AM_CFLAGS = -Wall + +sbin_PROGRAMS = softdog-mon hog + diff --git a/src/hog.c b/src/hog.c new file mode 100644 index 0000000..d887f4d --- /dev/null +++ b/src/hog.c @@ -0,0 +1,35 @@ +#include +#include +#include +// Memory hog +int +main(int argc, const char *argv[]) +{ + int j,k; + char *p; + int pid; + int mallocsize; + + if (argc < 2) { + fprintf(stderr,"Need malloc size parameter\n"); + exit(1); + } + mallocsize = atoi(argv[1]); + for (k=0; k<5; k++) { + p = malloc(mallocsize); + if (p) + for(j=0;j +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* This program does everything in seconds granularity. + * The time to accomplish a task has the number of nanoseconds + * added in to specify the time for the event to be accomplished. + */ + +char *name = PACKAGE; + +void usage() +{ + syslog(LOG_ERR, "%s (" PACKAGE ") " VERSION " (" __DATE__ " " __TIME__ ")\n", name); + syslog(LOG_ERR, "Copyright (C) 2021 by Multi-Tech Systems\n"); + + syslog(LOG_ERR,"USAGE: feedwatchdog device timeout"); + syslog(LOG_ERR,"Times below are in seconds"); + syslog(LOG_ERR,"ENVIRONMENT:"); + syslog(LOG_ERR," PIDFILE"); + syslog(LOG_ERR," FEED How often to feed watchdog in seconds"); + syslog(LOG_ERR," FILESAMPLERATE How often to open/read/write/close test file"); + syslog(LOG_ERR," MONITORFILE Path to test file"); + syslog(LOG_ERR," MINIMUM_AVAILABLE_MEM Memory Available"); + syslog(LOG_ERR," MINIMUM_FREEHIGH Available high memory"); + syslog(LOG_ERR," MEMSAMPLERATE Sample rate to test memory"); + syslog(LOG_ERR," MEMSAMPLES Samples to accumulate"); + syslog(LOG_ERR," MEMFAILEDSAMPLES Number of samples below threshold before reboot"); + syslog(LOG_ERR," SHUTDOWNTIMEOUT Set watchdog timeout seconds at SIGTERM signal"); + exit(1); +} + + +// Returns true if earlier is before or matches later. +int +timed_out(struct timespec *later, struct timespec *earlier) +{ + if (later->tv_sec > earlier->tv_sec) + return 1; // Time has elapsed by seconds + + if (later->tv_sec == earlier->tv_sec) { + if (later->tv_nsec >= earlier->tv_nsec) + return 1; // Seconds match, and nano-seconds have elapsed + } + return 0; +} + + +// Return the soonest of t1, t2, and t3. Prefers t3 to t2. Prefers t2 to t1. +void +min_time(struct timespec *t1, struct timespec *t2, struct timespec *t3, struct timespec **soonest) { + if(timed_out(t1,t2)) + *soonest = t2; + else + *soonest = t1; + if(timed_out(*soonest,t3)) + *soonest = t3; +} + + +char *pidfile; +int feed; // Seconds between feedings of the watchdog +int filesamplerate; +unsigned long long fileval_longlong = 0x5555555555555555ULL; +int timeout; +char *monitorfile; +unsigned long long minimum_available_mem; +unsigned long long minimum_freehigh; +int memsamplerate; +int memsamples; +int memfailedsamples; +char pidstr[32]; +char *available_samples; +char *freehigh_samples; +int memsample_idx; +struct timespec t0; +struct timespec feed_time; // When to next feed the watchdog +struct timespec file_time; // When to next write to the file +struct timespec mem_time; // When to next test memory +int error_count; // Cumulative errors +int shutdown_timeout; +int devfd = -1; + +void +sigterm_handler(int sig) +{ + struct sigaction hdlr_action; + // Shutdown could be long if doing a flash upgrade + if(devfd != -1) + ioctl(devfd, WDIOC_SETTIMEOUT, &shutdown_timeout); + /* syslog is not safe in a signal handler */ + memset(&hdlr_action,0,sizeof hdlr_action); + hdlr_action.sa_handler = SIG_DFL; + sigaction(SIGTERM,&hdlr_action,NULL); + kill(getpid(),SIGTERM); + _exit(0); +} + +int +main(int argc, char *argv[]) +{ + int result, count; + char *tmp; + int pidfd,monitorfd; + char *watchdogdevicepath; + struct timespec t0, *stime = NULL; + struct sysinfo info; + struct sigaction action; + + openlog(name,LOG_PERROR,LOG_DAEMON); + + memset(&action,0,sizeof action); + action.sa_handler = sigterm_handler; + sigaction(SIGTERM,&action,NULL); + + if (argc < 3) + usage(); + timeout = atoi(argv[2]); + if (timeout < 1) { + syslog(LOG_ERR,"feedwatchdog: timeout must be at least 1"); + usage(); + } + pidfile = secure_getenv("PIDFILE"); + if(pidfile == NULL) { + pidfile = "/run/softdog-mon.pid"; + syslog(LOG_ERR,"feedwatchdog: pidfile is NULL"); + pidfd = open(pidfile,O_WRONLY|O_CREAT|O_TRUNC,0644); + } else { + syslog(LOG_ERR,"feedwatchdog: pidfile is %s",pidfile); + pidfd = open(pidfile,O_WRONLY|O_CREAT|O_TRUNC,0644); + } + + if (pidfd == -1) { + syslog(LOG_ERR,"PIDFILE: Could not open %s: %d: %s",pidfile,errno,strerror(errno)); + usage(); + } + + tmp = secure_getenv("FEED"); + if (tmp == NULL) { + syslog(LOG_ERR,"FEED is missing"); + usage(); + } + feed = atoi(tmp); + + tmp = secure_getenv("FILESAMPLERATE"); + if (tmp == NULL) { + syslog(LOG_ERR,"FILESAMPLERATE is missing"); + usage(); + } + filesamplerate = atoi(tmp); + + // Prime the monitor file + monitorfile = secure_getenv("MONITORFILE"); + monitorfd = open(monitorfile,O_WRONLY|O_CREAT|O_SYNC|O_TRUNC,0644); + if (monitorfd == -1) { + syslog(LOG_ERR,"MONITORFILE: Could not open %s: %d: %s",monitorfile,errno,strerror(errno)); + usage(); + } + result = write(monitorfd,&fileval_longlong,sizeof fileval_longlong); + if(result != 8) { + syslog(LOG_ERR,"MONITORFILE: Could not write %s: %d: %s",monitorfile,errno,strerror(errno)); + usage(); + } + close(monitorfd); + + tmp = secure_getenv("MINIMUM_AVAILABLE_MEM"); + if (tmp == NULL) { + syslog(LOG_ERR,"MINIMUM_AVAILABLE_MEM is missing"); + usage(); + } + minimum_available_mem = strtoull(tmp,NULL,0); + + tmp = secure_getenv("MINIMUM_FREEHIGH"); + if (tmp == NULL) { + syslog(LOG_ERR,"MINIMUM_FREEHIGH is missing"); + usage(); + } + minimum_freehigh = strtoull(tmp,NULL,0); + + + tmp = secure_getenv("MEMSAMPLERATE"); + if (tmp == NULL) { + syslog(LOG_ERR,"MEMSAMPLERATE is missing"); + usage(); + } + memsamplerate = atoi(tmp); + + tmp = secure_getenv("MEMSAMPLES"); + if (tmp == NULL) { + syslog(LOG_ERR,"MEMSAMPLES is missing"); + usage(); + } + memsamples = atoi(tmp); + available_samples = (char *)calloc(memsamples,sizeof available_samples[0]); + if(available_samples == NULL) { + syslog(LOG_ERR,"ERROR: Out of memory"); + exit(1); + } + freehigh_samples = (char *)calloc(memsamples,sizeof freehigh_samples[0]); + if(available_samples == NULL) { + syslog(LOG_ERR,"ERROR: Out of memory"); + exit(1); + } + memsample_idx = 0; + + tmp = secure_getenv("MEMFAILEDSAMPLES"); + if (tmp == NULL) { + syslog(LOG_ERR,"MEMFAILEDSAMPLES is missing"); + usage(); + } + memfailedsamples = atoi(tmp); + + tmp = secure_getenv("SHUTDOWNTIMEOUT"); + if (tmp == NULL) { + syslog(LOG_ERR,"SHUTDOWNTIMEOUT is missing"); + usage(); + } + shutdown_timeout = atoi(tmp); + + watchdogdevicepath = argv[1]; + syslog(LOG_ALERT,"%s: Version %s",name,VERSION); + syslog(LOG_ALERT,"All times in seconds, sizes in bytes"); + syslog(LOG_ALERT,"Watchdog Device: %s",watchdogdevicepath); + syslog(LOG_ALERT,"Watchdog Timout in Seconds: %d",timeout); + syslog(LOG_ALERT,"PID File: %s",pidfile); + syslog(LOG_ALERT,"Feed: %d",feed); + syslog(LOG_ALERT,"File Sample Rate %d",filesamplerate); + syslog(LOG_ALERT,"Monitor File (I/O health check) %s",monitorfile); + syslog(LOG_ALERT,"Minimum Available Memory %llu",minimum_available_mem); + syslog(LOG_ALERT,"Minimum Free High Memory %llu",minimum_freehigh); + syslog(LOG_ALERT,"Memory Sample Rate %d",memsamplerate); + syslog(LOG_ALERT,"Memory Samples Collected %d",memsamples); + syslog(LOG_ALERT,"Memory Failed Samples (maximum) %d",memfailedsamples); + syslog(LOG_ALERT,"Shudown timeout %d",shutdown_timeout); + result = clock_gettime(CLOCK_MONOTONIC, &t0); + if (result == -1) { + syslog(LOG_ERR,"ERROR: System Error: The system is not supporting MONOTONIC time"); + exit(1); + } + result = fork(); + switch(result) + { + case -1: + syslog(LOG_ERR,"ERROR: Could not fork: %d: %s", errno, strerror(errno)); + break; + case 0: + close(0); close(1); close(2); + setsid(); + break; + default: + _exit(0); + } + count = snprintf(pidstr,sizeof pidstr,"%u",(int)getpid()); + if ((count > 0) && (pidfd >= 0) && (count < sizeof pidstr)) { + pidstr[count] = '\n'; + result = write(pidfd,pidstr,count+1); + if (result == -1) + syslog(LOG_ERR,"ERROR: Could not write to PID file: %d: %s", errno, strerror(errno)); + } + close(pidfd); + + devfd = open(argv[1],O_RDWR); + if (devfd == -1) { + syslog(LOG_ERR,"feedwatchdog: Could not open %s: %s",argv[1],strerror(errno)); + usage(); + } + + clock_gettime(CLOCK_MONOTONIC, &t0); + feed_time = file_time = mem_time = t0; + syslog(LOG_DEBUG,"****INITIAL t0: size %d:%d %lu:%lu mem_time %lu:%lu",sizeof t0.tv_sec,sizeof t0.tv_nsec,t0.tv_sec,t0.tv_nsec,mem_time.tv_sec,mem_time.tv_nsec); + + ioctl(devfd, WDIOC_SETTIMEOUT, &timeout); + syslog(LOG_ERR,"The timeout was set to %d seconds",timeout); + + + while (1) { + clock_gettime(CLOCK_MONOTONIC, &t0); + if (timed_out(&t0,&feed_time)) { + ioctl(devfd, WDIOC_KEEPALIVE, 0); + feed_time.tv_sec += feed; // Next time to feed the watchdog + } + if (timed_out(&t0,&file_time)) { + unsigned long long readval; + monitorfd = open(monitorfile,O_RDWR|O_SYNC); + if(monitorfd == -1) { + syslog(LOG_ERR,"ERROR: Could not open %s: %d: %s",monitorfile,errno,strerror(errno)); + error_count++; + } else { + result = read(monitorfd,&readval,sizeof readval); + if(result == -1) { + syslog(LOG_ERR,"ERROR: Could not read %s: %d: %s",monitorfile,errno,strerror(errno)); + error_count++; + } + if (result != sizeof readval) { + syslog(LOG_ERR,"ERROR: %s: Expected to read %d, but read %d bytes",monitorfile,sizeof readval,result); + error_count++; + } + if (readval != fileval_longlong) { + syslog(LOG_ERR,"ERROR: %s: Expected to read %llu, but read %llu value",monitorfile,fileval_longlong,readval); + error_count++; + } + fileval_longlong++; + result = lseek(monitorfd,0,SEEK_SET); + if(result == -1) { + syslog(LOG_ERR,"ERROR: Could not rewind %s: %d: %s",monitorfile,errno,strerror(errno)); + error_count++; + } + result = write(monitorfd,&fileval_longlong,sizeof fileval_longlong); + if(result == -1) { + syslog(LOG_ERR,"ERROR: Could write %s: %d: %s",monitorfile,errno,strerror(errno)); + error_count++; + } + if(result != sizeof fileval_longlong) { + syslog(LOG_ERR,"ERROR: %s: Expected to write %d, but wrote %d bytes",monitorfile,sizeof fileval_longlong,result); + error_count++; + } + if(result == sizeof fileval_longlong) + file_time.tv_sec += filesamplerate; + close(monitorfd); + } // Good file descriptor for monitor file. + } // Monitor file timeout (write to monitor file) + + // Memory checks + if(timed_out(&t0,&mem_time)) { + unsigned long long testval; + int i; + int acount,fcount; + + syslog(LOG_DEBUG,"t0: %lu:%lu mem_time %lu:%lu",t0.tv_sec,t0.tv_nsec,mem_time.tv_sec,mem_time.tv_nsec); + sysinfo(&info); + testval = (info.freeram * info.mem_unit); + available_samples[memsample_idx] = (testval < minimum_available_mem); + testval = (info.freehigh * info.mem_unit); + freehigh_samples[memsample_idx] = (testval < minimum_freehigh); + memsample_idx++; + memsample_idx = (memsample_idx % memsamples); + acount = fcount = 0; + for (i=0;i memfailedsamples) { + syslog(LOG_ERR,"Memory Available failure: %llu, should be at least %llu",testval,minimum_available_mem); + exit(1); + } + if (fcount > memfailedsamples) { + syslog(LOG_ERR,"High Memory failure: %llu, should be at least %llu",testval,minimum_freehigh); + exit(1); + } + mem_time.tv_sec += memsamplerate; + } // Time to check memory. + // How long do we sleep? + min_time(&mem_time,&file_time,&feed_time,&stime); + clock_nanosleep(CLOCK_MONOTONIC,TIMER_ABSTIME,stime,NULL); + } // Loop forever. + return 100; /* NOTREACHED */ +} -- cgit v1.2.3