/*
 * Copyright (C), 2000-2007 by the monit project group.
 * All Rights Reserved.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "config.h"

#ifdef HAVE_STRING_H
#include <string.h>
#endif

#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif

#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif

#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#ifdef HAVE_DIRENT_H
#include <dirent.h>
#endif

#include "monitor.h"
#include "alert.h"
#include "event.h"
#include "process.h"


/**
 * Implementation of the event interface.
* * @author Jan-Henrik Haukeland, * @author Martin Pala * @version \$Id: event.c,v 1.66 2007/07/25 12:54:28 hauk Exp $ * @file */ /* ------------------------------------------------------------- Definitions */ EventTable_T Event_Table[]= { {EVENT_CHANGED, "Changed", "Changed not"}, {EVENT_CHECKSUM, "Checksum failed", "Checksum passed"}, {EVENT_CONNECTION, "Connection failed", "Connection passed"}, {EVENT_DATA, "Data access error", "Data access succeeded"}, {EVENT_EXEC, "Execution failed", "Execution succeeded"}, {EVENT_GID, "GID failed", "GID passed"}, {EVENT_ICMP, "ICMP failed", "ICMP passed"}, {EVENT_INSTANCE, "Monit instance changed", "Monit instance changed not"}, {EVENT_INVALID, "Invalid type", "Type passed"}, {EVENT_MATCH, "Regex match", "No regex match"}, {EVENT_NONEXIST, "Does not exist", "Exists"}, {EVENT_PERMISSION, "Permission failed", "Permission passed"}, {EVENT_RESOURCE, "Resource limit matched", "Resource limit passed"}, {EVENT_SIZE, "Size failed", "Size passed"}, {EVENT_TIMEOUT, "Timeout", "Timeout recovery"}, {EVENT_TIMESTAMP, "Timestamp failed", "Timestamp passed"}, {EVENT_UID, "UID failed", "UID passed"}, /* Virtual events */ {EVENT_NULL, "No Event", "No Event"}, }; /* -------------------------------------------------------------- Prototypes */ static void handle_event(Event_T); static void handle_action(Event_T, Action_T); static void Event_queue_add(Event_T); /* ------------------------------------------------------------------ Public */ /** * Post a new Event * @param service The Service the event belongs to * @param id The event identification * @param state The event state * @param action Description of the event action * @param s Optional message describing the event */ void Event_post(Service_T service, long id, short state, EventAction_T action, char *s, ...) 
{ Event_T e = service->eventlist; ASSERT(service); ASSERT(action); ASSERT(state == STATE_FAILED || state == STATE_PASSED); if(e == NULL) { /* Only first failed event can initialize the queue for given event type, * thus passed events are ignored until first error. However, in the case * that the error flag is set for the passed event, we will allow it (i.e. * event queue was flushed during monit reload and the service was in * failed state before reload) */ if(state != STATE_FAILED && !(service->error & id)) return; /* Initialize event list and add first event. The manadatory informations * are cloned so the event is as standalone as possible and may be saved * to the queue without the dependency on the original service, thus * persistent and managable across monit restarts */ NEW(e); e->id = id; e->collected = time(NULL); e->source = xstrdup(service->name); e->group = service->group?xstrdup(service->group):xstrdup(""); e->mode = service->mode; e->type = service->type; e->state = STATE_INIT; e->state_map = state; e->action = action; if(s) { long l; va_list ap; va_start(ap, s); e->message = Util_formatString(s, ap, &l); va_end(ap); } pthread_mutex_init(&e->mutex, NULL); service->eventlist = e; } else { /* Try to find the event with the same origin and type identification. * Each service and each test have its own custom actions object, so * we share actions object address to identify event source. 
*/ do { if(e->action == action && e->id == id) { LOCK(e->mutex) e->collected = time(NULL); /* Shift the existing event flags to the left * and set the first bit based on actual state */ e->state_map <<= 1; e->state_map |= state; /* Update the message */ if(s) { long l; va_list ap; FREE(e->message); va_start(ap, s); e->message = Util_formatString(s, ap, &l); va_end(ap); } END_LOCK; break; } e = e->next; } while(e); if(!e) { /* Only first failed event can initialize the queue for given event type, * thus passed events are ignored until first error */ if(state != STATE_FAILED) return; /* Event was not found in the pending events list, we will add it. * The manadatory informations are cloned so the event is as standalone * as possible and may be saved to the queue without the dependency on * the original service, thus persistent and managable across monit * restarts */ NEW(e); e->id = id; e->collected = time(NULL); e->source = xstrdup(service->name); e->group = service->group?xstrdup(service->group):xstrdup(""); e->mode = service->mode; e->type = service->type; e->state = STATE_INIT; e->state_map = state; e->action = action; if(s) { long l; va_list ap; va_start(ap, s); e->message = Util_formatString(s, ap, &l); va_end(ap); } pthread_mutex_init(&e->mutex, NULL); e->next = service->eventlist; service->eventlist = e; } } e->state_changed = Event_check_state(e, state); /* In the case that the state changed, update it and reset the counter */ if(e->state_changed) { e->state = state; e->count = 1; } else { e->count++; } LOCK(e->mutex) handle_event(e); END_LOCK; } /* -------------------------------------------------------------- Properties */ /** * Get the Service where the event orginated * @param E An event object * @return The Service where the event orginated */ Service_T Event_get_source(Event_T E) { Service_T s = NULL; ASSERT(E); if(!(s = Util_getService(E->source))) { LogError("Service %s not found in monit configuration\n", E->source); } return s; } /** * Get the 
Service name where the event orginated * @param E An event object * @return The Service name where the event orginated */ char *Event_get_source_name(Event_T E) { ASSERT(E); return (E->source); } /** * Get the group name of the service where the event orginated * @param E An event object * @return The group name of the service where the event orginated */ char *Event_get_source_group(Event_T E) { ASSERT(E); return (E->group); } /** * Get the service type of the service where the event orginated * @param E An event object * @return The service type of the service where the event orginated */ int Event_get_source_type(Event_T E) { ASSERT(E); return (E->type); } /** * Get the Event timestamp * @param E An event object * @return The Event timestamp */ time_t Event_get_collected(Event_T E) { ASSERT(E); return E->collected; } /** * Get the Event raw state * @param E An event object * @return The Event raw state */ short Event_get_state(Event_T E) { ASSERT(E); return E->state; } /** * Return the actual event state based on event state bitmap * and event ratio needed to trigger the state change * @param E An event object * @param S Actual posted state * @return The Event raw state */ short Event_check_state(Event_T E, short S) { int i; int count = 0; Action_T action; Service_T service; long long flag; ASSERT(E); if(!(service = Event_get_source(E))) return TRUE; /* Only the true failed state condition can change the initial state */ if(S == STATE_PASSED && E->state == STATE_INIT && !(service->error & E->id)) { return FALSE; } action = (S == STATE_PASSED)?E->action->passed:E->action->failed; /* Compare as many bits as cycles able to trigger the action */ for(i = 0; i < action->cycles; i++) { /* Check the state of the particular cycle given by the bit position */ flag = (E->state_map >> i) & 0x1; /* Count occurences of the posted state */ if(flag == S) { count++; } } if(count >= action->count && S != E->state) { return TRUE; } return FALSE; } /** * Get the Event type * @param 
E An event object * @return The Event type */ int Event_get_id(Event_T E) { ASSERT(E); return E->id; } /** * Get the optionally Event message describing why the event was * fired. * @param E An event object * @return The Event message. May be NULL */ const char *Event_get_message(Event_T E) { ASSERT(E); return E->message; } /** * Get a textual description of actual event type. For instance if the * event type is possitive EVENT_TIMESTAMP, the textual description is * "Timestamp error". Likewise if the event type is negative EVENT_CHECKSUM * the textual description is "Checksum recovery" and so on. * @param E An event object * @return A string describing the event type in clear text. If the * event type is not found NULL is returned. */ const char *Event_get_description(Event_T E) { EventTable_T *et= Event_Table; ASSERT(E); while((*et).id) { if(E->id == (*et).id) { return E->state?(*et).description_failed:(*et).description_passed; } et++; } return NULL; } /** * Get an event action id. * @param E An event object * @return An action id */ short Event_get_action(Event_T E) { short id; Action_T A; ASSERT(E); A = E->state?E->action->failed:E->action->passed; /* In the case of passive mode we replace the description of start, stop * or restart action for alert action, because these actions are passive in * this mode */ id= (E->mode == MODE_PASSIVE && ((A->id == ACTION_START)|| (A->id == ACTION_STOP) || (A->id == ACTION_RESTART)) )?ACTION_ALERT:A->id; return id; } /** * Get a textual description of actual event action. For instance if the * event type is possitive EVENT_NONEXIST, the textual description of * failed state related action is "restart". Likewise if the event type is * negative EVENT_CHECKSUM the textual description of recovery related action * is "alert" and so on. * @param E An event object * @return A string describing the event type in clear text. If the * event type is not found NULL is returned. 
*/ const char *Event_get_action_description(Event_T E) { ASSERT(E); return actionnames[Event_get_action(E)]; } /** * Reprocess the partialy handled event queue */ void Event_queue_process() { DIR *dir = NULL; FILE *file = NULL; struct dirent *de = NULL; EventAction_T ea = NULL; Action_T a = NULL; /* return in the case that the eventqueue is not enabled or empty */ if( !Run.eventlist_dir || ( !Run.handler_init && !Run.handler_queue[HANDLER_ALERT] && !Run.handler_queue[HANDLER_COLLECTOR] ) ) { return; } if(! (dir = opendir(Run.eventlist_dir)) ) { if(errno != ENOENT) { LogError("%s: cannot open the directory %s -- %s\n", prog, Run.eventlist_dir, STRERROR); } return; } if((de = readdir(dir))) { DEBUG("Processing postponed events queue\n"); } NEW(ea); NEW(a); while(de) { int size; int *version = NULL; short *action = NULL; Event_T e = NULL; struct stat st; char file_name[STRLEN]; /* In the case that all handlers failed, skip the further processing in * this cycle. Alert handler is currently defined anytime (either * explicitly or localhost by default) */ if( (Run.collectors && FLAG(Run.handler_flag, HANDLER_COLLECTOR) && FLAG(Run.handler_flag, HANDLER_ALERT) ) || FLAG(Run.handler_flag, HANDLER_ALERT)) { break; } snprintf(file_name, STRLEN, "%s/%s", Run.eventlist_dir, de->d_name); if(!stat(file_name, &st) && S_ISREG(st.st_mode)) { DEBUG("%s: processing queued event %s\n", prog, file_name); if(! 
(file = fopen(file_name, "r")) ) { LogError("%s: Processing failed - cannot open the event file %s -- %s\n", prog, file_name, STRERROR); goto error1; } /* read event structure version */ if(!(version = File_readQueue(file, &size)) || size != sizeof(int)) { LogError("skipping %s - unknown data format\n", file_name, *version); goto error2; } if(*version != EVENT_VERSION) { LogError("Aborting event %s - incompatible data format version %d\n", file_name, *version); goto error2; } /* read event structure */ if(!(e = File_readQueue(file, &size)) || size != sizeof(*e)) goto error2; /* read source */ if(!(e->source = File_readQueue(file, &size))) goto error3; /* read group */ if(!(e->group = File_readQueue(file, &size))) goto error3; /* read message */ if(!(e->message = File_readQueue(file, &size))) goto error3; /* read event action */ if(!(action = File_readQueue(file, &size)) || size != sizeof(short)) goto error3; a->id = *action; if(e->state == STATE_FAILED) { ea->failed = a; } else { ea->passed = a; } e->action = ea; /* Retry all remaining handlers */ /* alert */ if(e->flag & HANDLER_ALERT) { if(Run.handler_init) { Run.handler_queue[HANDLER_ALERT]++; } if((Run.handler_flag & HANDLER_ALERT) != HANDLER_ALERT) { if( handle_alert(e) != HANDLER_ALERT ) { e->flag &= ~HANDLER_ALERT; Run.handler_queue[HANDLER_ALERT]--; } else { LogError("Alert handler failed, retry scheduled for next cycle\n"); Run.handler_flag |= HANDLER_ALERT; } } } /* collector */ if(e->flag & HANDLER_COLLECTOR) { if(Run.handler_init) { Run.handler_queue[HANDLER_COLLECTOR]++; } if((Run.handler_flag & HANDLER_COLLECTOR) != HANDLER_COLLECTOR) { if( handle_collector(e) != HANDLER_COLLECTOR ) { e->flag &= ~HANDLER_COLLECTOR; Run.handler_queue[HANDLER_COLLECTOR]--; } else { LogError("Collector handler failed, retry scheduled for next cycle\n"); Run.handler_flag |= HANDLER_COLLECTOR; } } } /* If no error persists, remove it from the queue */ if(e->flag == HANDLER_PASSED) { DEBUG("Removing event %s from the queue 
for later external delivery\n", file_name); unlink(file_name); } error3: FREE(e->source); FREE(e->group); FREE(e->message); FREE(e); FREE(action); error2: FREE(version); fclose(file); } error1: de = readdir(dir); } Run.handler_init = FALSE; closedir(dir); FREE(a); FREE(ea); return; } /* ----------------------------------------------------------------- Private */ /* * Handle the event * @param E An event */ static void handle_event(Event_T E) { Service_T S; ASSERT(E); ASSERT(E->action); ASSERT(E->action->failed); ASSERT(E->action->passed); /* We will handle only first passed event, recurrent passed events * or insufficient passed events during failed service state are * ignored. Failed events are handled each time. */ if(!E->state_changed && (E->state == STATE_PASSED || ((E->state_map & 0x1) ^ 0x1))) { return; } if(E->message) { /* In the case that the service state is yet initializing and error * occured, log it and exit. Passed events in init state are not * logged. */ if(E->state != STATE_INIT || E->state_map & 0x1) { if(E->id == EVENT_INSTANCE || E->state == STATE_PASSED) { LogInfo("%s\n", E->message); } else { LogError("%s\n", E->message); } } if(E->state == STATE_INIT) { return; } } S = Event_get_source(E); if(!S) { LogError("Event handling aborted\n"); return; } if(E->state == STATE_FAILED) { S->error |= E->id; handle_action(E, E->action->failed); } else { S->error &= ~E->id; handle_action(E, E->action->passed); } /* Possible event state change was handled so we will reset the flag. 
*/ E->state_changed = FALSE; } static void handle_action(Event_T E, Action_T A) { Service_T s; ASSERT(E); ASSERT(A); E->flag = HANDLER_PASSED; if(A->id == ACTION_IGNORE) { return; } /* Alert and collector event notification are common actions */ E->flag |= handle_alert(E); E->flag |= handle_collector(E); /* In the case that some subhandler failed, enqueue the event for * partial reprocessing */ if(E->flag != HANDLER_PASSED) { if(Run.eventlist_dir) { Event_queue_add(E); } else { LogError("Aborting event\n"); } } if(!(s = Event_get_source(E))) { LogError("Event action handling aborted\n"); return; } if(A->id == ACTION_ALERT) { return; /* Already handled */ } else if(A->id == ACTION_EXEC) { spawn(s, A->exec, Event_get_description(E)); return; } else { if(s->def_timeout && (A->id == ACTION_START || A->id == ACTION_RESTART)) { s->nstart++; } if(s->mode == MODE_PASSIVE && (A->id == ACTION_START || A->id == ACTION_STOP || A->id == ACTION_RESTART)) { return; } control_service(s->name, A->id); } } /** * Add the partialy handled event to the global queue * @param E An event object */ static void Event_queue_add(Event_T E) { FILE *file = NULL; char file_name[STRLEN]; int version = EVENT_VERSION; short action = Event_get_action(E); int rv = FALSE; mode_t mask; sigset_t ns; sigset_t os; ASSERT(E); ASSERT(E->flag != HANDLER_PASSED); if(!File_checkQueueDirectory(Run.eventlist_dir, 0700)) { LogError("%s: Aborting event - cannot access the directory %s\n", prog, Run.eventlist_dir); return; } if(!File_checkQueueLimit(Run.eventlist_dir, Run.eventlist_slots)) { LogError("%s: Aborting event - queue over quota\n", prog); return; } set_signal_block(&ns, &os); /* compose the file name of actual timestamp and service name */ snprintf(file_name, STRLEN, "%s/%ld_%s", Run.eventlist_dir, (long int)time(NULL), E->source); DEBUG("%s: Adding event to the queue file %s for later delivery\n", prog, file_name); mask = umask(QUEUEMASK); file = fopen(file_name, "w"); umask(mask); if(! 
file) { LogError("%s: Aborting event - cannot open the event file %s -- %s\n", prog, file_name, STRERROR); return; } /* write event structure version */ if(!(rv = File_writeQueue(file, &version, sizeof(int)))) goto error; /* write event structure */ if(!(rv = File_writeQueue(file, E, sizeof(*E)))) goto error; /* write source */ if(!(rv = File_writeQueue(file, E->source, E->source?strlen(E->source)+1:0))) goto error; /* write group */ if(!(rv = File_writeQueue(file, E->group, E->group?strlen(E->group)+1:0))) goto error; /* write message */ if(!(rv = File_writeQueue(file, E->message, E->message?strlen(E->message)+1:0))) goto error; /* write event action */ if(!(rv = File_writeQueue(file, &action, sizeof(short)))) goto error; error: if(!rv) { LogError("%s: Aborting event - unable to save event information to %s\n", prog, file_name); unlink(file_name); } else { if(!Run.handler_init && E->flag & HANDLER_ALERT) { Run.handler_queue[HANDLER_ALERT]++; } if(!Run.handler_init && E->flag & HANDLER_COLLECTOR) { Run.handler_queue[HANDLER_COLLECTOR]++; } fclose(file); } unset_signal_block(&os); return; }