1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  26  */
  27 
  28 /*
  29  * Disk Monitor
  30  */
  31 #include <sys/types.h>
  32 #include <sys/stat.h>
  33 #include <fcntl.h>
  34 #include <time.h>
  35 #include <stdio.h>
  36 #include <stdlib.h>
  37 #include <strings.h>
  38 #include <stdarg.h>
  39 #include <errno.h>
  40 #include <signal.h>
  41 #include <unistd.h>
  42 #include <pthread.h>
  43 #include <libnvpair.h>
  44 #include <fm/fmd_api.h>
  45 #include <fm/fmd_fmri.h>
  46 #include <sys/fm/protocol.h>
  47 #include <sys/fm/io/disk.h>
  48 #include <fm/libtopo.h>
  49 
  50 #include "disk_monitor.h"
  51 #include "hotplug_mgr.h"
  52 #include "schg_mgr.h"
  53 #include "topo_gather.h"
  54 #include "dm_platform.h"
  55 
  56 #define THIS_FMD_MODULE_NAME "disk-monitor"
  57 
/*
 * Bit flags recording which managers diskmon_init() has successfully
 * brought up.  diskmon_init() ORs a flag in after each manager starts and
 * tests the flags on its failure path to decide what must be torn down.
 */
static enum disk_init_state {
        INIT_STATE_NONE = 0,
        STATE_CHANGE_MGR_INITTED = 2,
        HOTPLUG_MGR_INITTED = 4
} g_init_state = INIT_STATE_NONE;
  63 
/*
 * Kind of fault list (suspect vs. repaired).
 * NOTE(review): not referenced in the visible part of this file -- confirm
 * whether it is still needed.
 */
typedef enum {
        LT_SUSPECT,
        LT_REPAIRED
} fm_list_type_t;
  68 
/*
 * Global verbosity flag -- controls chattiness of debug messages and
 * warnings.  Its value is determined by the fmd property "log-level"
 * settable in the DE's .conf file.
 */
log_class_t                     g_verbose = 0;
/* Module configuration; _fmd_init() counts the disks on its disk_list. */
cfgdata_t                       *config_data = NULL;
/* fmd module handle: saved by _fmd_init(), reset to NULL by _fmd_fini(). */
fmd_hdl_t                       *g_fm_hdl = NULL;

/* Forward declaration; the property table itself is defined further down. */
static const fmd_prop_t         fmd_props[];
  79 
/*
 * Tear the module down: clean up the hotplug manager, then the state
 * change manager, then release the configuration.  The two managers are
 * cleaned up in the same order as on diskmon_init()'s failure path, which
 * notes that the order matters.
 */
static void
diskmon_teardown_all(void)
{
        cleanup_hotplug_manager();
        cleanup_state_change_manager(config_data);
        config_fini();
}
  87 
  88 static int
  89 count_disks(diskmon_t *disklistp)
  90 {
  91         int i = 0;
  92 
  93         while (disklistp != NULL) {
  94                 i++;
  95                 disklistp = disklistp->next;
  96         }
  97 
  98         return (i);
  99 }
 100 
 101 static int
 102 diskmon_init(void)
 103 {
 104         /*
 105          * Block the generation of state change events (generated by the
 106          * hotplug manager thread) here; they will be unblocked after the
 107          * state change manager thread is ready to accept state changes
 108          * (shortly after it starts).
 109          */
 110         block_state_change_events();
 111 
 112         if (dm_platform_init() != 0)
 113                 goto cleanup;
 114 
 115         if (init_hotplug_manager() != 0)
 116                 goto cleanup;
 117         else
 118                 g_init_state |= HOTPLUG_MGR_INITTED;
 119 
 120         if (init_state_change_manager(config_data) != 0)
 121                 goto cleanup;
 122         else
 123                 g_init_state |= STATE_CHANGE_MGR_INITTED;
 124 
 125         return (E_SUCCESS);
 126 
 127 cleanup:
 128 
 129         unblock_state_change_events();
 130 
 131         /*
 132          * The cleanup order here does matter, due to dependencies between the
 133          * managers.
 134          */
 135         if (g_init_state & HOTPLUG_MGR_INITTED)
 136                 cleanup_hotplug_manager();
 137         if (g_init_state & STATE_CHANGE_MGR_INITTED)
 138                 cleanup_state_change_manager(config_data);
 139         dm_platform_fini();
 140 
 141         return (E_ERROR);
 142 }
 143 
 144 static void
 145 dm_fault_execute_actions(fmd_hdl_t *hdl, diskmon_t *diskp, nvlist_t *nvl)
 146 {
 147         const char              *action_prop = NULL;
 148         const char              *action_string;
 149 
 150         /*
 151          * The predictive failure action is the activation of the fault
 152          * indicator.
 153          */
 154         if (fmd_nvl_class_match(hdl, nvl,
 155             DISK_ERROR_CLASS "." FM_FAULT_DISK_OVERTEMP))
 156                 action_prop = DISK_PROP_OTEMPACTION;
 157 
 158         if (fmd_nvl_class_match(hdl, nvl,
 159             DISK_ERROR_CLASS "." FM_FAULT_DISK_TESTFAIL))
 160                 action_prop = DISK_PROP_STFAILACTION;
 161 
 162         if (fmd_nvl_class_match(hdl, nvl,
 163             DISK_ERROR_CLASS "." FM_FAULT_SSM_WEAROUT))
 164                 action_prop = DISK_PROP_SSMWEAROUTACTION;
 165 
 166         dm_fault_indicator_set(diskp, INDICATOR_ON);
 167 
 168         if (action_prop != NULL &&
 169             (action_string = dm_prop_lookup(diskp->props, action_prop))
 170             != NULL) {
 171 
 172                 if (dm_platform_indicator_execute(action_string) != 0) {
 173                         log_warn("Fault action `%s' did not successfully "
 174                             "complete.\n", action_string);
 175                 }
 176         }
 177 }
 178 
 179 static void
 180 diskmon_agent_repair(fmd_hdl_t *hdl, nvlist_t *nvl, int repair)
 181 {
 182         char            *uuid = NULL;
 183         nvlist_t        **nva;
 184         uint_t          nvc;
 185         diskmon_t       *diskp;
 186         nvlist_t        *fmri;
 187         nvlist_t        *fltnvl;
 188         int             err = 0;
 189 
 190         err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
 191         err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
 192             &nva, &nvc);
 193         if (err != 0)
 194                 return;
 195 
 196         while (nvc-- != 0) {
 197 
 198                 fltnvl = *nva++;
 199 
 200                 if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri)
 201                     != 0)
 202                         continue;
 203 
 204                 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
 205                         continue;
 206 
 207                 log_msg(MM_MAIN, "Disk %s repaired!\n",
 208                     diskp->location);
 209 
 210                 dm_fault_indicator_set(diskp, INDICATOR_OFF);
 211 
 212                 dm_state_change(diskp, HPS_REPAIRED);
 213         }
 214 
 215         if (repair)
 216                 fmd_case_uuresolved(hdl, uuid);
 217 
 218 }
 219 
 220 static void
 221 diskmon_agent_suspect(fmd_hdl_t *hdl, nvlist_t *nvl)
 222 {
 223         char            *uuid = NULL;
 224         nvlist_t        **nva;
 225         uint_t          nvc;
 226         diskmon_t       *diskp;
 227         nvlist_t        *fmri;
 228         nvlist_t        *fltnvl;
 229         int             err = 0;
 230 
 231         err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
 232         err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
 233             &nva, &nvc);
 234         if (err != 0)
 235                 return;
 236 
 237         while (nvc-- != 0 && !fmd_case_uuclosed(hdl, uuid)) {
 238 
 239                 fltnvl = *nva++;
 240 
 241                 if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri) != 0)
 242                         continue;
 243 
 244                 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
 245                         continue;
 246 
 247                 /* Execute the actions associated with this fault */
 248                 dm_fault_execute_actions(hdl, diskp,  fltnvl);
 249 
 250                 /*
 251                  * Send a state change event to the state change manager
 252                  */
 253                 dm_state_change(diskp, HPS_FAULTED);
 254         }
 255 
 256         if (!fmd_case_uuclosed(hdl, uuid)) {
 257                 /* Case is closed */
 258                 fmd_case_uuclose(hdl, uuid);
 259         }
 260 }
 261 
 262 /*ARGSUSED*/
 263 static void
 264 diskmon_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
 265 {
 266         diskmon_t       *diskp;
 267         nvlist_t        *fmri;
 268 
 269         if (g_verbose & MM_MAIN)
 270                 nvlist_print(stderr, nvl);
 271 
 272         /*
 273          * Act on the fault suspect list or repaired list (embedded agent
 274          * action).
 275          */
 276         if (fmd_nvl_class_match(hdl, nvl, FM_LIST_REPAIRED_CLASS)) {
 277 
 278                 diskmon_agent_repair(hdl, nvl, 1);
 279                 return;
 280 
 281         } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_UPDATED_CLASS)) {
 282 
 283                 diskmon_agent_repair(hdl, nvl, 0);
 284                 return;
 285 
 286         } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_SUSPECT_CLASS)) {
 287 
 288                 diskmon_agent_suspect(hdl, nvl);
 289                 return;
 290         } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_RESOLVED_CLASS)) {
 291                 return;
 292         }
 293 
 294         /*
 295          * If we get any replayed faults, set the diskmon's faulted
 296          * flag for the appropriate fault, then change the diskmon's state
 297          * to faulted.
 298          */
 299         if (fmd_nvl_class_match(hdl, nvl, DISK_ERROR_CLASS ".*")) {
 300 
 301                 if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE,
 302                     &fmri) != 0)
 303                         return;
 304 
 305                 if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
 306                         return;
 307 
 308                 /* Execute the actions associated with this fault */
 309                 dm_fault_execute_actions(hdl, diskp, nvl);
 310 
 311                 /*
 312                  * If the fault wasn't generated by this module, send a
 313                  * state change event to the state change manager
 314                  */
 315                 dm_state_change(diskp, HPS_FAULTED);
 316                 return;
 317         }
 318 }
 319 
/*
 * fmd entry points for this module.  Only the receive callback is
 * provided; the remaining slots are left NULL.
 */
static const fmd_hdl_ops_t fmd_ops = {
        diskmon_recv,   /* fmdo_recv */
        NULL,           /* fmdo_timeout */
        NULL,           /* fmdo_close */
        NULL,           /* fmdo_stats */
        NULL,           /* fmdo_gc */
};
 327 
/*
 * Module properties: the log level, a uint32 defaulting to "0".  The
 * table ends with an all-NULL/zero sentinel entry.
 */
static const fmd_prop_t fmd_props[] = {
        { GLOBAL_PROP_LOG_LEVEL, FMD_TYPE_UINT32, "0" },
        { NULL, 0, NULL }
};
 332 
/*
 * Module description handed to fmd_hdl_register() by _fmd_init(): display
 * name, version, entry points and property table.
 */
static const fmd_hdl_info_t fmd_info = {
        "Disk Monitor",
        DISK_MONITOR_MODULE_VERSION,
        &fmd_ops,
        fmd_props
};
 339 
 340 void
 341 _fmd_init(fmd_hdl_t *hdl)
 342 {
 343         fmd_case_t      *cp;
 344         int             disk_count;
 345 
 346         g_fm_hdl = hdl;
 347 
 348         if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
 349                 return;
 350         }
 351 
 352         if (config_init()) {
 353                 log_err("Could not initialize configuration!\n");
 354                 fmd_hdl_unregister(hdl);
 355                 return;
 356         }
 357 
 358         if (config_get(hdl, fmd_props)) {
 359                 config_fini();
 360                 log_err("Could not retrieve configuration from libtopo!\n");
 361                 fmd_hdl_unregister(hdl);
 362                 return;
 363         }
 364 
 365         /*
 366          * If there are no disks to monitor, bail out
 367          */
 368         if ((disk_count = count_disks(config_data->disk_list)) == 0) {
 369                 config_fini();
 370                 fmd_hdl_unregister(hdl);
 371                 return;
 372         }
 373 
 374         if (diskmon_init() == E_ERROR) {
 375                 config_fini();
 376                 fmd_hdl_unregister(hdl);
 377                 return;
 378         }
 379 
 380         log_msg(MM_MAIN, "Monitoring %d disks.\n", disk_count);
 381 
 382         /*
 383          * Iterate over all active cases.
 384          * Since we automatically solve all cases, these cases must have
 385          * had the fault added, but the DE must have been interrupted
 386          * before they were solved.
 387          */
 388         for (cp = fmd_case_next(hdl, NULL);
 389             cp != NULL; cp = fmd_case_next(hdl, cp)) {
 390 
 391                 if (!fmd_case_solved(hdl, cp))
 392                         fmd_case_solve(hdl, cp);
 393         }
 394 }
 395 
/*
 * Module unload entry point: tear down the managers and the
 * configuration, then drop the saved fmd handle.
 */
/*ARGSUSED*/
void
_fmd_fini(fmd_hdl_t *hdl)
{
        diskmon_teardown_all();
        g_fm_hdl = NULL;
}