summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJohn Klug <john.klug@multitech.com>2021-01-19 17:21:32 -0600
committerJohn Klug <john.klug@multitech.com>2021-01-19 17:21:32 -0600
commitf8d0b344ae1b2dc3894c1a597c0565911b762742 (patch)
tree79e8f174ab88cd9c00258956f2cb2e6bed548956 /src
downloadsoftdog-mon-f8d0b344ae1b2dc3894c1a597c0565911b762742.tar.gz
softdog-mon-f8d0b344ae1b2dc3894c1a597c0565911b762742.tar.bz2
softdog-mon-f8d0b344ae1b2dc3894c1a597c0565911b762742.zip
softdog-mon for monitoring a system using kernel module "softdog"HEAD0.1master
Diffstat (limited to 'src')
-rw-r--r--src/Makefile.am6
-rw-r--r--src/hog.c35
-rw-r--r--src/softdog-mon.c372
3 files changed, 413 insertions, 0 deletions
diff --git a/src/Makefile.am b/src/Makefile.am
new file mode 100644
index 0000000..dba3d24
--- /dev/null
+++ b/src/Makefile.am
@@ -0,0 +1,6 @@
+## Process this file with automake to produce Makefile.in
+AUTOMAKE_OPTIONS = gnu
+AM_CFLAGS = -Wall
+
+sbin_PROGRAMS = softdog-mon hog
+
diff --git a/src/hog.c b/src/hog.c
new file mode 100644
index 0000000..d887f4d
--- /dev/null
+++ b/src/hog.c
@@ -0,0 +1,35 @@
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+// Memory hog: test helper that allocates and touches argv[1] bytes in each
+// of five rounds and forks after every round.  Neither parent nor child
+// leaves the loop after fork(), so both carry on with the remaining rounds.
+// Every process finally rewrites its most recent buffer with its own pid
+// and blocks in pause().  The allocations are deliberately never freed.
+int
+main(int argc, const char *argv[])
+{
+    long j;
+    int k;
+    char *p = NULL;     // Most recent successful allocation, if any
+    char *end;
+    int pid;
+    long mallocsize;
+
+    if (argc < 2) {
+        fprintf(stderr,"Need malloc size parameter\n");
+        exit(1);
+    }
+    // strtol() lets us tell "no digits" apart from a real value, which atoi() cannot.
+    mallocsize = strtol(argv[1],&end,10);
+    if (end == argv[1] || mallocsize < 1) {
+        fprintf(stderr,"Malloc size must be a positive integer\n");
+        exit(1);
+    }
+    for (k=0; k<5; k++) {
+        char *chunk = malloc((size_t)mallocsize);
+
+        if (chunk) {
+            // Touch every byte so the pages are really backed by memory.
+            for(j=0;j<mallocsize;j++)
+                chunk[j] = (char)j;
+            p = chunk;
+        } else {
+            fprintf(stderr,"malloc of %ld bytes failed\n",mallocsize);
+        }
+        pid = fork();
+
+        if (pid > 0)
+            fprintf(stderr,"pid=%d\n",pid);
+        else if (pid < 0)
+            perror("fork");
+    }
+    pid = getpid();
+    // Skip the final pass when no allocation ever succeeded; writing
+    // through a NULL pointer here would crash instead of hogging memory.
+    if (p) {
+        for(j=0;j<mallocsize;j++)
+            p[j] = (char)pid;
+    }
+
+    pause();
+    // NOTREACHED
+    return (0);
+}
diff --git a/src/softdog-mon.c b/src/softdog-mon.c
new file mode 100644
index 0000000..75591dd
--- /dev/null
+++ b/src/softdog-mon.c
@@ -0,0 +1,372 @@
+#define _GNU_SOURCE
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/sysinfo.h>
+#include <linux/watchdog.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <config.h>
+#include <errno.h>
+#include <syslog.h>
+#include <time.h>
+#include <signal.h>
+
+/* All intervals in this program are whole seconds.
+ * Each deadline starts from a monotonic clock reading and is advanced by
+ * adding seconds only, so it keeps the nanosecond part of that first
+ * reading; the nanoseconds fix the exact instant within the second.
+ */
+
+char *name = PACKAGE;
+
+/*
+ * Log the program banner followed by the usage summary, one syslog
+ * message per line at LOG_ERR, then terminate with exit status 1.
+ */
+void usage()
+{
+    static const char *const help[] = {
+        "USAGE: feedwatchdog device timeout",
+        "Times below are in seconds",
+        "ENVIRONMENT:",
+        " PIDFILE",
+        " FEED How often to feed watchdog in seconds",
+        " FILESAMPLERATE How often to open/read/write/close test file",
+        " MONITORFILE Path to test file",
+        " MINIMUM_AVAILABLE_MEM Memory Available",
+        " MINIMUM_FREEHIGH Available high memory",
+        " MEMSAMPLERATE Sample rate to test memory",
+        " MEMSAMPLES Samples to accumulate",
+        " MEMFAILEDSAMPLES Number of samples below threshold before reboot",
+        " SHUTDOWNTIMEOUT Set watchdog timeout seconds at SIGTERM signal",
+    };
+    size_t line;
+
+    syslog(LOG_ERR, "%s (" PACKAGE ") " VERSION " (" __DATE__ " " __TIME__ ")\n", name);
+    syslog(LOG_ERR, "Copyright (C) 2021 by Multi-Tech Systems\n");
+
+    /* None of the help lines contains a conversion, so "%s" emits them verbatim. */
+    for (line = 0; line < sizeof help / sizeof help[0]; line++)
+        syslog(LOG_ERR, "%s", help[line]);
+    exit(1);
+}
+
+
+// Reports whether the instant *earlier has been reached at time *later.
+// Returns 1 when later >= earlier (seconds decide first, nanoseconds break
+// the tie) and 0 otherwise.
+int
+timed_out(struct timespec *later, struct timespec *earlier)
+{
+    if (later->tv_sec != earlier->tv_sec)
+        return later->tv_sec > earlier->tv_sec;
+
+    // Same second: the nanosecond field decides, equality counts as elapsed.
+    return later->tv_nsec >= earlier->tv_nsec;
+}
+
+
+// Stores in *soonest a pointer to the earliest of t1, t2 and t3.
+// On a tie t2 wins over t1, and t3 wins over whichever of those was kept.
+void
+min_time(struct timespec *t1, struct timespec *t2, struct timespec *t3, struct timespec **soonest) {
+    // timed_out(a,b) is true when a >= b, i.e. when b is no later than a.
+    struct timespec *best = timed_out(t1,t2) ? t2 : t1;
+
+    if (timed_out(best,t3))
+        best = t3;
+    *soonest = best;
+}
+
+
+char *pidfile; // Path of the PID file: PIDFILE from the environment, or a built-in default
+int feed; // Seconds between feedings of the watchdog
+int filesamplerate; // Seconds between monitor-file checks (FILESAMPLERATE)
+unsigned long long fileval_longlong = 0x5555555555555555ULL; // Value expected in the monitor file; incremented and rewritten on every check
+int timeout; // Watchdog timeout in seconds, taken from argv[2]
+char *monitorfile; // Path of the I/O health-check file (MONITORFILE)
+unsigned long long minimum_available_mem; // Free-RAM threshold in bytes (MINIMUM_AVAILABLE_MEM)
+unsigned long long minimum_freehigh; // Free high-memory threshold in bytes (MINIMUM_FREEHIGH)
+int memsamplerate; // Seconds between memory checks (MEMSAMPLERATE)
+int memsamples; // Number of slots in each of the two sample rings below (MEMSAMPLES)
+int memfailedsamples; // Below-threshold samples tolerated in a ring before exiting (MEMFAILEDSAMPLES)
+char pidstr[32]; // Scratch buffer for the PID text written to the PID file
+char *available_samples; // Ring of 0/1 flags: 1 when free RAM was below its threshold
+char *freehigh_samples; // Ring of 0/1 flags: 1 when free high memory was below its threshold
+int memsample_idx; // Ring slot the next memory sample is stored in
+struct timespec t0; // NOTE(review): main() declares its own t0, which shadows this one
+struct timespec feed_time; // When to next feed the watchdog
+struct timespec file_time; // When to next write to the file
+struct timespec mem_time; // When to next test memory
+int error_count; // Cumulative errors (incremented only; nothing in the code shown reads it)
+int shutdown_timeout; // Watchdog timeout in seconds applied by the SIGTERM handler (SHUTDOWNTIMEOUT)
+int devfd = -1; // Watchdog device descriptor; stays -1 until main() opens the device
+
+/*
+ * SIGTERM handler.
+ *
+ * Re-arms the watchdog with shutdown_timeout (when the device is open) so a
+ * slow shutdown is not cut short, then restores the default SIGTERM
+ * disposition and re-raises the signal so the process terminates by SIGTERM.
+ *
+ * sig is unused; the handler is only installed for SIGTERM.
+ */
+void
+sigterm_handler(int sig)
+{
+    struct sigaction hdlr_action;
+    sigset_t unblock;
+
+    (void)sig;
+    // Shutdown could be long if doing a flash upgrade
+    if(devfd != -1)
+        ioctl(devfd, WDIOC_SETTIMEOUT, &shutdown_timeout);
+    /* syslog is not safe in a signal handler */
+    memset(&hdlr_action,0,sizeof hdlr_action);
+    hdlr_action.sa_handler = SIG_DFL;
+    sigaction(SIGTERM,&hdlr_action,NULL);
+    /*
+     * SIGTERM is blocked while its own handler runs, so without this the
+     * kill() below would only leave the signal pending and _exit() would
+     * always win.
+     */
+    sigemptyset(&unblock);
+    sigaddset(&unblock,SIGTERM);
+    sigprocmask(SIG_UNBLOCK,&unblock,NULL);
+    kill(getpid(),SIGTERM);
+    /* Only reached if the re-raised signal did not terminate the process. */
+    _exit(0);
+}
+
+/*
+ * Parse the leading decimal integer of text into *out.
+ * As with atoi(), leading whitespace and trailing characters are tolerated;
+ * unlike atoi(), text without digits or a value that does not fit in an int
+ * is reported.  Returns 0 on success and -1 on failure (*out untouched).
+ */
+static int
+parse_int(const char *text, int *out)
+{
+    char *end;
+    long val;
+
+    errno = 0;
+    val = strtol(text,&end,10);
+    /* The int round trip rejects values the narrowing would change. */
+    if (end == text || errno == ERANGE || val != (long)(int)val)
+        return -1;
+    *out = (int)val;
+    return 0;
+}
+
+/* Return the value of a mandatory environment variable; log and exit via usage() if unset. */
+static char *
+require_env(const char *var)
+{
+    char *val = secure_getenv(var);
+
+    if (val == NULL) {
+        syslog(LOG_ERR,"%s is missing",var);
+        usage();
+    }
+    return val;
+}
+
+/* Return a mandatory environment variable as an int >= minimum; log and exit via usage() otherwise. */
+static int
+require_env_int(const char *var, int minimum)
+{
+    const char *text = require_env(var);
+    int val;
+
+    if (parse_int(text,&val) != 0 || val < minimum) {
+        syslog(LOG_ERR,"%s: \"%s\" must be an integer of at least %d",var,text,minimum);
+        usage();
+    }
+    return val;
+}
+
+/*
+ * I/O health check: read back the value stored in the monitor file, compare
+ * it with fileval_longlong, then store the incremented value.  Every failure
+ * is logged and counted in error_count.
+ */
+static void
+check_monitor_file(void)
+{
+    unsigned long long readval = 0;
+    ssize_t result;
+    int fd;
+
+    /*
+     * Schedule the next check up front.  If the deadline only moved after a
+     * successful write, a persistent failure would leave it in the past and
+     * the main loop would spin without sleeping.
+     */
+    file_time.tv_sec += filesamplerate;
+
+    fd = open(monitorfile,O_RDWR|O_SYNC);
+    if(fd == -1) {
+        syslog(LOG_ERR,"ERROR: Could not open %s: %d: %s",monitorfile,errno,strerror(errno));
+        error_count++;
+        return;
+    }
+
+    /* One error per check: readval is only compared after a complete read. */
+    result = read(fd,&readval,sizeof readval);
+    if(result == -1) {
+        syslog(LOG_ERR,"ERROR: Could not read %s: %d: %s",monitorfile,errno,strerror(errno));
+        error_count++;
+    } else if (result != (ssize_t)sizeof readval) {
+        syslog(LOG_ERR,"ERROR: %s: Expected to read %zu, but read %zd bytes",monitorfile,sizeof readval,result);
+        error_count++;
+    } else if (readval != fileval_longlong) {
+        syslog(LOG_ERR,"ERROR: %s: Expected to read %llu, but read %llu value",monitorfile,fileval_longlong,readval);
+        error_count++;
+    }
+
+    fileval_longlong++;
+    if(lseek(fd,0,SEEK_SET) == (off_t)-1) {
+        syslog(LOG_ERR,"ERROR: Could not rewind %s: %d: %s",monitorfile,errno,strerror(errno));
+        error_count++;
+    }
+    result = write(fd,&fileval_longlong,sizeof fileval_longlong);
+    if(result == -1) {
+        syslog(LOG_ERR,"ERROR: Could not write %s: %d: %s",monitorfile,errno,strerror(errno));
+        error_count++;
+    } else if(result != (ssize_t)sizeof fileval_longlong) {
+        syslog(LOG_ERR,"ERROR: %s: Expected to write %zu, but wrote %zd bytes",monitorfile,sizeof fileval_longlong,result);
+        error_count++;
+    }
+    close(fd);
+}
+
+/*
+ * Memory health check: record whether free RAM and free high memory are
+ * below their thresholds in the sample rings, and exit(1) once more than
+ * memfailedsamples slots of either ring are flagged.
+ *
+ * now is the timestamp of the current loop pass; it is used for the debug log only.
+ */
+static void
+check_memory(const struct timespec *now)
+{
+    struct sysinfo info;
+    unsigned long long available, freehigh;
+    int i;
+    int acount = 0, fcount = 0;
+
+    syslog(LOG_DEBUG,"t0: %ld:%ld mem_time %ld:%ld",(long)now->tv_sec,(long)now->tv_nsec,(long)mem_time.tv_sec,(long)mem_time.tv_nsec);
+    mem_time.tv_sec += memsamplerate;
+
+    if (sysinfo(&info) == -1) {
+        syslog(LOG_ERR,"ERROR: sysinfo failed: %d: %s",errno,strerror(errno));
+        error_count++;
+        return;
+    }
+    /* Widen before multiplying: the product can exceed unsigned long on 32-bit targets. */
+    available = (unsigned long long)info.freeram * info.mem_unit;
+    freehigh = (unsigned long long)info.freehigh * info.mem_unit;
+
+    available_samples[memsample_idx] = (available < minimum_available_mem);
+    freehigh_samples[memsample_idx] = (freehigh < minimum_freehigh);
+    memsample_idx = (memsample_idx + 1) % memsamples;
+
+    for (i=0;i<memsamples;i++) {
+        acount += available_samples[i];
+        fcount += freehigh_samples[i];
+    }
+    syslog(LOG_DEBUG,"mem samples: acount:%d fcount:%d memsamples:%d",acount,fcount,memsamples);
+    if (acount > memfailedsamples) {
+        syslog(LOG_ERR,"Memory Available failure: %llu, should be at least %llu",available,minimum_available_mem);
+        exit(1);
+    }
+    if (fcount > memfailedsamples) {
+        syslog(LOG_ERR,"High Memory failure: %llu, should be at least %llu",freehigh,minimum_freehigh);
+        exit(1);
+    }
+}
+
+/*
+ * Entry point: softdog-mon device timeout
+ *
+ * Reads its configuration from argv and the environment (see usage()),
+ * primes the monitor file, forks into the background, writes the PID file,
+ * opens the watchdog device and then loops forever: feeding the watchdog,
+ * exercising the monitor file and sampling memory, each on its own
+ * schedule.  The process exits with status 1 when the memory checks fail
+ * too often; the watchdog device is not explicitly disarmed on that path.
+ */
+int
+main(int argc, char *argv[])
+{
+    int count, nullfd;
+    ssize_t result;
+    int pidfd,monitorfd;
+    char *watchdogdevicepath;
+    struct timespec t0, *stime = NULL;
+    struct sigaction action;
+
+    openlog(name,LOG_PERROR,LOG_DAEMON);
+
+    memset(&action,0,sizeof action);
+    action.sa_handler = sigterm_handler;
+    sigaction(SIGTERM,&action,NULL);
+
+    if (argc < 3)
+        usage();
+    if (parse_int(argv[2],&timeout) != 0 || timeout < 1) {
+        syslog(LOG_ERR,"feedwatchdog: timeout must be at least 1");
+        usage();
+    }
+
+    pidfile = secure_getenv("PIDFILE");
+    if(pidfile == NULL) {
+        pidfile = "/run/softdog-mon.pid";
+        syslog(LOG_ERR,"feedwatchdog: pidfile is NULL");
+    } else {
+        syslog(LOG_ERR,"feedwatchdog: pidfile is %s",pidfile);
+    }
+    pidfd = open(pidfile,O_WRONLY|O_CREAT|O_TRUNC,0644);
+    if (pidfd == -1) {
+        syslog(LOG_ERR,"PIDFILE: Could not open %s: %d: %s",pidfile,errno,strerror(errno));
+        usage();
+    }
+
+    /* A zero or negative interval would never move its deadline forward. */
+    feed = require_env_int("FEED",1);
+    filesamplerate = require_env_int("FILESAMPLERATE",1);
+
+    // Prime the monitor file
+    monitorfile = require_env("MONITORFILE");
+    monitorfd = open(monitorfile,O_WRONLY|O_CREAT|O_SYNC|O_TRUNC,0644);
+    if (monitorfd == -1) {
+        syslog(LOG_ERR,"MONITORFILE: Could not open %s: %d: %s",monitorfile,errno,strerror(errno));
+        usage();
+    }
+    result = write(monitorfd,&fileval_longlong,sizeof fileval_longlong);
+    if(result != (ssize_t)sizeof fileval_longlong) {
+        syslog(LOG_ERR,"MONITORFILE: Could not write %s: %d: %s",monitorfile,errno,strerror(errno));
+        usage();
+    }
+    close(monitorfd);
+
+    minimum_available_mem = strtoull(require_env("MINIMUM_AVAILABLE_MEM"),NULL,0);
+    minimum_freehigh = strtoull(require_env("MINIMUM_FREEHIGH"),NULL,0);
+    memsamplerate = require_env_int("MEMSAMPLERATE",1);
+
+    /* memsamples sizes the rings and is used as a modulus, so it must be positive. */
+    memsamples = require_env_int("MEMSAMPLES",1);
+    available_samples = calloc(memsamples,sizeof *available_samples);
+    freehigh_samples = calloc(memsamples,sizeof *freehigh_samples);
+    if(available_samples == NULL || freehigh_samples == NULL) {
+        syslog(LOG_ERR,"ERROR: Out of memory");
+        exit(1);
+    }
+    memsample_idx = 0;
+
+    memfailedsamples = require_env_int("MEMFAILEDSAMPLES",0);
+    shutdown_timeout = require_env_int("SHUTDOWNTIMEOUT",1);
+
+    watchdogdevicepath = argv[1];
+    syslog(LOG_ALERT,"%s: Version %s",name,VERSION);
+    syslog(LOG_ALERT,"All times in seconds, sizes in bytes");
+    syslog(LOG_ALERT,"Watchdog Device: %s",watchdogdevicepath);
+    syslog(LOG_ALERT,"Watchdog Timout in Seconds: %d",timeout);
+    syslog(LOG_ALERT,"PID File: %s",pidfile);
+    syslog(LOG_ALERT,"Feed: %d",feed);
+    syslog(LOG_ALERT,"File Sample Rate %d",filesamplerate);
+    syslog(LOG_ALERT,"Monitor File (I/O health check) %s",monitorfile);
+    syslog(LOG_ALERT,"Minimum Available Memory %llu",minimum_available_mem);
+    syslog(LOG_ALERT,"Minimum Free High Memory %llu",minimum_freehigh);
+    syslog(LOG_ALERT,"Memory Sample Rate %d",memsamplerate);
+    syslog(LOG_ALERT,"Memory Samples Collected %d",memsamples);
+    syslog(LOG_ALERT,"Memory Failed Samples (maximum) %d",memfailedsamples);
+    syslog(LOG_ALERT,"Shudown timeout %d",shutdown_timeout);
+    if (clock_gettime(CLOCK_MONOTONIC, &t0) == -1) {
+        syslog(LOG_ERR,"ERROR: System Error: The system is not supporting MONOTONIC time");
+        exit(1);
+    }
+
+    switch(fork())
+    {
+        case -1:
+            syslog(LOG_ERR,"ERROR: Could not fork: %d: %s", errno, strerror(errno));
+            exit(1);
+        case 0:
+            /*
+             * Point the standard descriptors at /dev/null instead of leaving
+             * them closed: otherwise later open() calls reuse numbers 0-2
+             * while LOG_PERROR still writes to descriptor 2.
+             */
+            nullfd = open("/dev/null",O_RDWR);
+            if (nullfd != -1) {
+                dup2(nullfd,STDIN_FILENO);
+                dup2(nullfd,STDOUT_FILENO);
+                dup2(nullfd,STDERR_FILENO);
+                if (nullfd > STDERR_FILENO)
+                    close(nullfd);
+            } else {
+                close(0); close(1); close(2);
+            }
+            setsid();
+            break;
+        default:
+            _exit(0);
+    }
+
+    count = snprintf(pidstr,sizeof pidstr,"%d\n",(int)getpid());
+    if ((count > 0) && ((size_t)count < sizeof pidstr)) {
+        if (write(pidfd,pidstr,(size_t)count) == -1)
+            syslog(LOG_ERR,"ERROR: Could not write to PID file: %d: %s", errno, strerror(errno));
+    }
+    close(pidfd);
+
+    devfd = open(watchdogdevicepath,O_RDWR);
+    if (devfd == -1) {
+        syslog(LOG_ERR,"feedwatchdog: Could not open %s: %s",watchdogdevicepath,strerror(errno));
+        usage();
+    }
+
+    clock_gettime(CLOCK_MONOTONIC, &t0);
+    feed_time = file_time = mem_time = t0;
+    syslog(LOG_DEBUG,"****INITIAL t0: size %zu:%zu %ld:%ld mem_time %ld:%ld",sizeof t0.tv_sec,sizeof t0.tv_nsec,(long)t0.tv_sec,(long)t0.tv_nsec,(long)mem_time.tv_sec,(long)mem_time.tv_nsec);
+
+    if (ioctl(devfd, WDIOC_SETTIMEOUT, &timeout) == -1)
+        syslog(LOG_ERR,"ERROR: Could not set the watchdog timeout: %d: %s",errno,strerror(errno));
+    else
+        syslog(LOG_ERR,"The timeout was set to %d seconds",timeout);
+
+    while (1) {
+        clock_gettime(CLOCK_MONOTONIC, &t0);
+        if (timed_out(&t0,&feed_time)) {
+            ioctl(devfd, WDIOC_KEEPALIVE, 0);
+            feed_time.tv_sec += feed; // Next time to feed the watchdog
+        }
+        if (timed_out(&t0,&file_time))
+            check_monitor_file();
+        if (timed_out(&t0,&mem_time))
+            check_memory(&t0);
+
+        // Sleep until the earliest of the three deadlines.
+        min_time(&mem_time,&file_time,&feed_time,&stime);
+        clock_nanosleep(CLOCK_MONOTONIC,TIMER_ABSTIME,stime,NULL);
+    } // Loop forever.
+    return 100; /* NOTREACHED */
+}