1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * Disk Monitor
  29  */
  30 #include <sys/types.h>
  31 #include <sys/stat.h>
  32 #include <fcntl.h>
  33 #include <time.h>
  34 #include <stdio.h>
  35 #include <stdlib.h>
  36 #include <strings.h>
  37 #include <stdarg.h>
  38 #include <errno.h>
  39 #include <signal.h>
  40 #include <unistd.h>
  41 #include <pthread.h>
  42 #include <libnvpair.h>
  43 #include <fm/fmd_api.h>
  44 #include <fm/fmd_fmri.h>
  45 #include <sys/fm/protocol.h>
  46 #include <sys/fm/io/disk.h>
  47 #include <fm/libtopo.h>
  48 
  49 #include "disk_monitor.h"
  50 #include "hotplug_mgr.h"
  51 #include "schg_mgr.h"
  52 #include "topo_gather.h"
  53 #include "dm_platform.h"
  54 
  55 #define THIS_FMD_MODULE_NAME "disk-monitor"
  56 
  57 static enum disk_init_state {
  58         INIT_STATE_NONE = 0,
  59         STATE_CHANGE_MGR_INITTED = 2,
  60         HOTPLUG_MGR_INITTED = 4
  61 } g_init_state = INIT_STATE_NONE;
  62 
  63 typedef enum {
  64         LT_SUSPECT,
  65         LT_REPAIRED
  66 } fm_list_type_t;
  67 
  68 /*
  69  * Global verbosity flag -- controls chattiness of debug messages and
  70  * warnings.  Its value is determined by the fmd property "log-level"
  71  * settable in the DE's .conf file.
  72  */
  73 log_class_t                     g_verbose = 0;
  74 cfgdata_t                       *config_data = NULL;
  75 fmd_hdl_t                       *g_fm_hdl = NULL;
  76 
  77 static const fmd_prop_t         fmd_props[];
  78 
  79 static void
  80 diskmon_teardown_all(void)
  81 {
  82         cleanup_hotplug_manager();
  83         cleanup_state_change_manager(config_data);
  84         config_fini();
  85 }
  86 
  87 static int
  88 count_disks(diskmon_t *disklistp)
  89 {
  90         int i = 0;
  91 
  92         while (disklistp != NULL) {
  93                 i++;
  94                 disklistp = disklistp->next;
  95         }
  96 
  97         return (i);
  98 }
  99 
 100 static int
 101 diskmon_init(void)
 102 {
 103         /*
 104          * Block the generation of state change events (generated by the
 105          * hotplug manager thread) here; they will be unblocked after the
 106          * state change manager thread is ready to accept state changes
 107          * (shortly after it starts).
 108          */
 109         block_state_change_events();
 110 
 111         if (dm_platform_init() != 0)
 112                 goto cleanup;
 113 
 114         if (init_hotplug_manager() != 0)
 115                 goto cleanup;
 116         else
 117                 g_init_state |= HOTPLUG_MGR_INITTED;
 118 
 119         if (init_state_change_manager(config_data) != 0)
 120                 goto cleanup;
 121         else
 122                 g_init_state |= STATE_CHANGE_MGR_INITTED;
 123 
 124         return (E_SUCCESS);
 125 
 126 cleanup:
 127 
 128         unblock_state_change_events();
 129 
 130         /*
 131          * The cleanup order here does matter, due to dependencies between the
 132          * managers.
 133          */
 134         if (g_init_state & HOTPLUG_MGR_INITTED)
 135                 cleanup_hotplug_manager();
 136         if (g_init_state & STATE_CHANGE_MGR_INITTED)
 137                 cleanup_state_change_manager(config_data);
 138         dm_platform_fini();
 139 
 140         return (E_ERROR);
 141 }
 142 
 143 static void
 144 dm_fault_execute_actions(fmd_hdl_t *hdl, diskmon_t *diskp, nvlist_t *nvl)
 145 {
 146         const char              *action_prop = NULL;
 147         const char              *action_string;
 148 
 149         /*
 150          * The predictive failure action is the activation of the fault
 151          * indicator.
 152          */
 153         if (fmd_nvl_class_match(hdl, nvl,
 154             DISK_ERROR_CLASS "." FM_FAULT_DISK_OVERTEMP))
 155                 action_prop = DISK_PROP_OTEMPACTION;
 156 
 157         if (fmd_nvl_class_match(hdl, nvl,
 158             DISK_ERROR_CLASS "." FM_FAULT_DISK_TESTFAIL))
 159                 action_prop = DISK_PROP_STFAILACTION;
 160 
 161         dm_fault_indicator_set(diskp, INDICATOR_ON);
 162 
 163         if (action_prop != NULL &&
 164             (action_string = dm_prop_lookup(diskp->props, action_prop))
 165             != NULL) {
 166 
 167                 if (dm_platform_indicator_execute(action_string) != 0) {
 168                         log_warn("Fault action `%s' did not successfully "
 169                             "complete.\n", action_string);
 170                 }
 171         }
 172 }
 173 
 174 static void
 175 diskmon_agent_repair(fmd_hdl_t *hdl, nvlist_t *nvl, int repair)
 176 {
 177         char            *uuid = NULL;
 178         nvlist_t        **nva;
 179         uint_t          nvc;
 180         diskmon_t       *diskp;
 181         nvlist_t        *fmri;
 182         nvlist_t        *fltnvl;
 183         int             err = 0;
 184 
 185         err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
 186         err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
 187             &nva, &nvc);
 188         if (err != 0)
 189                 return;
 190 
 191         while (nvc-- != 0) {
 192 
 193                 fltnvl = *nva++;
 194 
 195                 if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri)
 196                     != 0)
 197                         continue;
 198 
 199                 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
 200                         continue;
 201 
 202                 log_msg(MM_MAIN, "Disk %s repaired!\n",
 203                     diskp->location);
 204 
 205                 dm_fault_indicator_set(diskp, INDICATOR_OFF);
 206 
 207                 dm_state_change(diskp, HPS_REPAIRED);
 208         }
 209 
 210         if (repair)
 211                 fmd_case_uuresolved(hdl, uuid);
 212 
 213 }
 214 
 215 static void
 216 diskmon_agent_suspect(fmd_hdl_t *hdl, nvlist_t *nvl)
 217 {
 218         char            *uuid = NULL;
 219         nvlist_t        **nva;
 220         uint_t          nvc;
 221         diskmon_t       *diskp;
 222         nvlist_t        *fmri;
 223         nvlist_t        *fltnvl;
 224         int             err = 0;
 225 
 226         err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
 227         err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
 228             &nva, &nvc);
 229         if (err != 0)
 230                 return;
 231 
 232         while (nvc-- != 0 && !fmd_case_uuclosed(hdl, uuid)) {
 233 
 234                 fltnvl = *nva++;
 235 
 236                 if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri) != 0)
 237                         continue;
 238 
 239                 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
 240                         continue;
 241 
 242                 /* Execute the actions associated with this fault */
 243                 dm_fault_execute_actions(hdl, diskp,  fltnvl);
 244 
 245                 /*
 246                  * Send a state change event to the state change manager
 247                  */
 248                 dm_state_change(diskp, HPS_FAULTED);
 249         }
 250 
 251         if (!fmd_case_uuclosed(hdl, uuid)) {
 252                 /* Case is closed */
 253                 fmd_case_uuclose(hdl, uuid);
 254         }
 255 }
 256 
 257 /*ARGSUSED*/
 258 static void
 259 diskmon_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
 260 {
 261         diskmon_t       *diskp;
 262         nvlist_t        *fmri;
 263 
 264         if (g_verbose & MM_MAIN)
 265                 nvlist_print(stderr, nvl);
 266 
 267         /*
 268          * Act on the fault suspect list or repaired list (embedded agent
 269          * action).
 270          */
 271         if (fmd_nvl_class_match(hdl, nvl, FM_LIST_REPAIRED_CLASS)) {
 272 
 273                 diskmon_agent_repair(hdl, nvl, 1);
 274                 return;
 275 
 276         } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_UPDATED_CLASS)) {
 277 
 278                 diskmon_agent_repair(hdl, nvl, 0);
 279                 return;
 280 
 281         } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_SUSPECT_CLASS)) {
 282 
 283                 diskmon_agent_suspect(hdl, nvl);
 284                 return;
 285         } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_RESOLVED_CLASS)) {
 286                 return;
 287         }
 288 
 289         /*
 290          * If we get any replayed faults, set the diskmon's faulted
 291          * flag for the appropriate fault, then change the diskmon's state
 292          * to faulted.
 293          */
 294         if (fmd_nvl_class_match(hdl, nvl, DISK_ERROR_CLASS ".*")) {
 295 
 296                 if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE,
 297                     &fmri) != 0)
 298                         return;
 299 
 300                 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
 301                         return;
 302 
 303                 /* Execute the actions associated with this fault */
 304                 dm_fault_execute_actions(hdl, diskp, nvl);
 305 
 306                 /*
 307                  * If the fault wasn't generated by this module, send a
 308                  * state change event to the state change manager
 309                  */
 310                 dm_state_change(diskp, HPS_FAULTED);
 311                 return;
 312         }
 313 }
 314 
/*
 * fmd entry points for this module.  Only the event-receive callback is
 * implemented; every other slot listed here is left NULL.
 */
static const fmd_hdl_ops_t fmd_ops = {
        diskmon_recv,   /* fmdo_recv */
        NULL,           /* fmdo_timeout */
        NULL,           /* fmdo_close */
        NULL,           /* fmdo_stats */
        NULL,           /* fmdo_gc */
};
 322 
/*
 * Module property table, referenced from fmd_info and also passed to
 * config_get() by _fmd_init().  It holds a single property,
 * GLOBAL_PROP_LOG_LEVEL, typed as an unsigned 32-bit value with a default
 * of "0"; the all-NULL entry marks the end of the table.
 */
static const fmd_prop_t fmd_props[] = {
        { GLOBAL_PROP_LOG_LEVEL, FMD_TYPE_UINT32, "0" },
        { NULL, 0, NULL }
};
 327 
/*
 * Registration record handed to fmd_hdl_register() by _fmd_init().
 */
static const fmd_hdl_info_t fmd_info = {
        "Disk Monitor",                 /* module description string */
        DISK_MONITOR_MODULE_VERSION,    /* module version string */
        &fmd_ops,                       /* entry points (see fmd_ops) */
        fmd_props                       /* property table (see fmd_props) */
};
 334 
 335 void
 336 _fmd_init(fmd_hdl_t *hdl)
 337 {
 338         fmd_case_t      *cp;
 339         int             disk_count;
 340 
 341         g_fm_hdl = hdl;
 342 
 343         if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
 344                 return;
 345         }
 346 
 347         if (config_init()) {
 348                 log_err("Could not initialize configuration!\n");
 349                 fmd_hdl_unregister(hdl);
 350                 return;
 351         }
 352 
 353         if (config_get(hdl, fmd_props)) {
 354                 config_fini();
 355                 log_err("Could not retrieve configuration from libtopo!\n");
 356                 fmd_hdl_unregister(hdl);
 357                 return;
 358         }
 359 
 360         /*
 361          * If there are no disks to monitor, bail out
 362          */
 363         if ((disk_count = count_disks(config_data->disk_list)) == 0) {
 364                 config_fini();
 365                 fmd_hdl_unregister(hdl);
 366                 return;
 367         }
 368 
 369         if (diskmon_init() == E_ERROR) {
 370                 config_fini();
 371                 fmd_hdl_unregister(hdl);
 372                 return;
 373         }
 374 
 375         log_msg(MM_MAIN, "Monitoring %d disks.\n", disk_count);
 376 
 377         /*
 378          * Iterate over all active cases.
 379          * Since we automatically solve all cases, these cases must have
 380          * had the fault added, but the DE must have been interrupted
 381          * before they were solved.
 382          */
 383         for (cp = fmd_case_next(hdl, NULL);
 384             cp != NULL; cp = fmd_case_next(hdl, cp)) {
 385 
 386                 if (!fmd_case_solved(hdl, cp))
 387                         fmd_case_solve(hdl, cp);
 388         }
 389 }
 390 
/*
 * Module unload entry point called by fmd: tear down the managers and the
 * configuration, then drop the cached module handle.
 */
/*ARGSUSED*/
void
_fmd_fini(fmd_hdl_t *hdl)
{
        /*
         * Tear down first and clear the handle last.
         * NOTE(review): the teardown path may still log through g_fm_hdl --
         * confirm before reordering.
         */
        diskmon_teardown_all();
        g_fm_hdl = NULL;
}