ssm-wearout Wdiff usr/src/cmd/fm/eversholt/files/common/disk.esc

Print this page

8074 need to add FMA event for SSD wearout

Split	Close
Expand all
Collapse all

          --- old/usr/src/cmd/fm/eversholt/files/common/disk.esc
          +++ new/usr/src/cmd/fm/eversholt/files/common/disk.esc

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *

↓ open down ↓

12 lines elided

↑ open up ↑

  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
       23 + * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  23   24   */
  24   25  
  25   26  #pragma dictionary "DISK"
  26   27  
  27   28  #define P                       disk
  28   29  
  29   30  fru P;
  30   31  asru P;
  31   32  
  32   33  /*

  33   34   * Overall comments for this file:
  34   35   * <disk-as-detector> The disk-as-detector DE provides the mapping between
  35   36   * ereports generated by a kernel disk driver sd(7D) and resulting faults.
  36   37   */

↓ open down ↓

4 lines elided

↑ open up ↑

  37   38  
  38   39  /*
  39   40   * SERD engine for media error fault propagation:
  40   41   *
  41   42   * This strategy is designed to give a file system, like ZFS, the
  42   43   * ability to attempt data recovery/relocation without faulting a disk.
  43   44   * This implementation depends on a file system retry to the same lba
  44   45   * to trigger a fault when recovery/relocation is not possible.
  45   46   *
  46   47   * We let the engine propagate one error only once every 1 minute and then if we
  47      - * still get 2 or more * errors within 24 hours for the same LBA, there is a fault.
       48 + * still get 2 or more errors within 24 hours for the same LBA,
       49 + * there is a fault.
  48   50   */
  49   51  engine serd.io.scsi.cmd.disk.dev.rqs.merr@P, N=1, T=24h;
  50   52  
  51   53  /*
  52   54   * disk-as-detector: fault events.
  53   55   */
  54   56  event fault.io.scsi.cmd.disk.dev.rqs.derr@P;
  55   57  event fault.io.scsi.cmd.disk.dev.rqs.merr@P,
  56   58      engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;
  57   59

  58   60  /*
  59   61   * The uderr fault will be defined at some future time.
  60   62   * event fault.io.scsi.cmd.disk.dev.uderr@P;
  61   63   */
  62   64  
  63   65  /*
  64   66   * disk-as-detector: upset events.
  65   67   * NOTE: For now we define an upset to implement discard.
  66   68   */
  67   69  event upset.io.scsi.cmd.disk.dev.rqs.derr@P;
  68   70  event upset.io.scsi.cmd.disk.dev.rqs.merr@P;
  69   71  event upset.io.scsi.cmd.disk.dev.uderr@P;
  70   72  event upset.io.scsi.cmd.disk.dev.serr@P;
  71   73  event upset.io.scsi.cmd.disk.tran@P;
  72   74  event upset.io.scsi.cmd.disk.recovered@P;
  73   75  
  74   76  /*
  75   77   * disk-as-detector: ereports from the kernel.
  76   78   *
  77   79   * We don't know the topology for all scsi disks, but the kernel will always
  78   80   * generate ereport telemetry assuming that we do. We define these ereports
  79   81   * with 'discard_if_config_unknown=1', which permits ereports against things
  80   82   * with unknown topology to be silently discarded.  The ereport data is logged
  81   83   * in either case, and can be viewed via 'fmdump -eV'.
  82   84   */
  83   85  event ereport.io.scsi.cmd.disk.dev.rqs.derr@P, discard_if_config_unknown=1;
  84   86  event ereport.io.scsi.cmd.disk.dev.rqs.merr@P, discard_if_config_unknown=1;
  85   87  event ereport.io.scsi.cmd.disk.dev.serr@P, discard_if_config_unknown=1;
  86   88  event ereport.io.scsi.cmd.disk.dev.uderr@P, discard_if_config_unknown=1;
  87   89  event ereport.io.scsi.cmd.disk.recovered@P, discard_if_config_unknown=1;
  88   90  event ereport.io.scsi.cmd.disk.tran@P, discard_if_config_unknown=1;
  89   91  
  90   92  /*
  91   93   * For some ereports we let the 'driver-assessment', communicated as part of
  92   94   * the ereport payload, determine fault vs. upset via propagation constraints.
  93   95   */
  94   96  #define DRIVER_ASSESSMENT_FATAL         \
  95   97              (payloadprop_contains("driver-assessment", "fatal"))
  96   98  #define DRIVER_ASSESSMENT_NONFATAL      (!DRIVER_ASSESSMENT_FATAL)
  97   99  
  98  100  /*
  99  101   * disk-as-detector: propagations from faults (based on
 100  102   * DRIVER_ASSESSMENT_FATAL).
 101  103   * We need to set additional fault payloads to indicate fault details.
 102  104   * The payloads we may need are listed as follows:
 103  105   * fault.io.scsi.cmd.disk.dev.rqs.derr
 104  106   *     op_code, key, asc, ascq
 105  107   * fault.io.scsi.cmd.disk.dev.rqs.merr
 106  108   *     op_code, key, asc, ascq, lba
 107  109   */
 108  110  prop fault.io.scsi.cmd.disk.dev.rqs.derr@P->
 109  111      ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_FATAL &&
 110  112      setpayloadprop("key", payloadprop("key")) &&
 111  113      setpayloadprop("asc", payloadprop("asc")) &&
 112  114      setpayloadprop("ascq", payloadprop("ascq"))};
 113  115  
 114  116  /*
 115  117   * Use setserdsuffix() with the specific LBA so that the SERD engine only
 116  118   * triggers if the fault recurs on the same LBA.
 117  119   */
 118  120  prop fault.io.scsi.cmd.disk.dev.rqs.merr@P->
 119  121      ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_FATAL &&
 120  122      setserdsuffix(payloadprop("lba")) &&
 121  123      setpayloadprop("key", payloadprop("key")) &&
 122  124      setpayloadprop("asc", payloadprop("asc")) &&
 123  125      setpayloadprop("ascq", payloadprop("ascq")) &&
 124  126      setpayloadprop("lba", payloadprop("lba"))};
 125  127  
 126  128  /*
 127  129   * NOTE: this propagation uses the "may" propagation of eversholt.
 128  130   * The ereport need never exist. It's just a way of making
 129  131   * the diagnosis wait for the within time on that ereport
 130  132   * to complete. Once it has completed the diagnosis continues
 131  133   * even though the dummy ereport didn't occur.
 132  134   */
 133  135  event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P {within(60s)};
 134  136  prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->
 135  137          ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;
 136  138  
 137  139  /*
 138  140   * The uderr fault will be propagated at some future time.
 139  141   * prop fault.io.scsi.cmd.disk.dev.uderr@P->
 140  142   *     ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
 141  143   */
 142  144  
 143  145  /*
 144  146   * disk-as-detector: propagations from upsets (based on
 145  147   * DRIVER_ASSESSMENT_NONFATAL).
 146  148   */
 147  149  prop upset.io.scsi.cmd.disk.dev.rqs.derr@P->
 148  150      ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_NONFATAL };
 149  151  
 150  152  prop upset.io.scsi.cmd.disk.dev.rqs.merr@P->
 151  153      ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_NONFATAL };
 152  154  
 153  155  /*
 154  156   * disk-as-detector: propagations from upsets (independent of
 155  157   * driver-assessment)
 156  158   */
 157  159  
 158  160  prop upset.io.scsi.cmd.disk.dev.serr@P->
 159  161      ereport.io.scsi.cmd.disk.dev.serr@P;
 160  162  
 161  163  prop upset.io.scsi.cmd.disk.dev.uderr@P->
 162  164      ereport.io.scsi.cmd.disk.dev.uderr@P;
 163  165  
 164  166  prop upset.io.scsi.cmd.disk.recovered@P->
 165  167      ereport.io.scsi.cmd.disk.recovered@P;
 166  168  
 167  169  prop upset.io.scsi.cmd.disk.tran@P->
 168  170      ereport.io.scsi.cmd.disk.tran@P;
 169  171  
 170  172  /*
 171  173   * --------------------------------------
 172  174   * The remainder of this file contains rules associated with the operation of
 173  175   * cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
 174  176   * 
 175  177   * The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
 176  178   * generated by the disk-transport fmd module, and the resulting faults.
 177  179   */

↓ open down ↓

120 lines elided

↑ open up ↑

 178  180  
 179  181  /*
 180  182   * Fault events: each names the disk (P) as FRU and ASRU with FITrate=10.
 181  183   */
 182  184  event fault.io.disk.over-temperature@P,
 183  185      FITrate=10, FRU=P, ASRU=P;
 184  186  event fault.io.disk.predictive-failure@P,
 185  187      FITrate=10, FRU=P, ASRU=P;
 186  188  event fault.io.disk.self-test-failure@P,
 187  189      FITrate=10, FRU=P, ASRU=P;
      190 +event fault.io.disk.ssm-wearout@P, FITrate=10, FRU=P, ASRU=P;
 188  191  
 189  192  /*
 190  193   * ereports.
 191  194   */
 192  195  event ereport.io.scsi.disk.over-temperature@P;
 193  196  event ereport.io.scsi.disk.predictive-failure@P;
 194  197  event ereport.io.scsi.disk.self-test-failure@P;
      198 +event ereport.io.scsi.disk.ssm-wearout@P;
 195  199  
 196  200  /*
 197  201   * Propagations.
 198  202   */
 199  203  prop fault.io.disk.over-temperature@P ->
 200  204      ereport.io.scsi.disk.over-temperature@P;
 201  205  
 202  206  prop fault.io.disk.self-test-failure@P ->
 203  207      ereport.io.scsi.disk.self-test-failure@P;
 204  208  
 205  209  prop fault.io.disk.predictive-failure@P ->
 206  210      ereport.io.scsi.disk.predictive-failure@P {
 207  211      setpayloadprop("asc", payloadprop("additional-sense-code")) &&
 208  212      setpayloadprop("ascq", payloadprop("additional-sense-code-qualifier")) };
      213 +
      214 +prop fault.io.disk.ssm-wearout@P ->
      215 +    ereport.io.scsi.disk.ssm-wearout@P {
      216 +    setpayloadprop("current-wearout-percentage",
      217 +        payloadprop("current-ssm-wearout")) &&
      218 +    setpayloadprop("threshold-wearout-percentage",
      219 +        payloadprop("threshold-ssm-wearout")) };

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX