1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
24 */
25
26 #pragma dictionary "DISK"
27 /* P abbreviates the "disk" path component used by every event below. */
28 #define P disk
29 /* Declare disk as both a FRU and an ASRU. */
30 fru P;
31 asru P;
32
33 /*
34 * Overall comments for this file:
35 * <disk-as-detector> The disk-as-detector DE provides the mapping between
36 * ereports generated by a kernel disk driver sd(7D) and resulting faults.
37 */
38
39 /*
40 * SERD engine for media error fault propagation:
41 *
42 * This strategy is designed to give a file system, like ZFS, the
43 * ability to attempt data recovery/relocation without faulting a disk.
44 * This implementation depends on a file system retry to the same LBA
45 * to trigger a fault when recovery/relocation is not possible.
46 *
47 * The engine propagates at most one error per minute; if 2 or more errors
48 * are still seen for the same LBA within 24 hours (N=1, T=24h), a fault is
49 * diagnosed.
50 */
51 engine serd.io.scsi.cmd.disk.dev.rqs.merr@P, N=1, T=24h;
52
53 /*
54 * disk-as-detector: fault events (only merr is attached to a SERD engine).
55 */
56 event fault.io.scsi.cmd.disk.dev.rqs.derr@P;
57 event fault.io.scsi.cmd.disk.dev.rqs.merr@P,
58 engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;
59
60 /*
61 * The uderr fault will be defined at some future time.
62 * event fault.io.scsi.cmd.disk.dev.uderr@P;
63 */
64
65 /*
66 * disk-as-detector: upset events.
67 * NOTE: For now we define an upset per ereport class to implement discard.
68 */
69 event upset.io.scsi.cmd.disk.dev.rqs.derr@P;
70 event upset.io.scsi.cmd.disk.dev.rqs.merr@P;
71 event upset.io.scsi.cmd.disk.dev.uderr@P;
72 event upset.io.scsi.cmd.disk.dev.serr@P;
73 event upset.io.scsi.cmd.disk.tran@P;
74 event upset.io.scsi.cmd.disk.recovered@P;
75
76 /*
77 * disk-as-detector: ereports from the kernel.
78 *
79 * We don't know the topology for all SCSI disks, but the kernel will always
80 * generate ereport telemetry assuming that we do. We define these ereports
81 * with 'discard_if_config_unknown=1', which permits ereports against things
82 * with unknown topology to be silently discarded. The ereport data is logged
83 * in either case, and can be viewed via 'fmdump -eV'.
84 */
85 event ereport.io.scsi.cmd.disk.dev.rqs.derr@P, discard_if_config_unknown=1;
86 event ereport.io.scsi.cmd.disk.dev.rqs.merr@P, discard_if_config_unknown=1;
87 event ereport.io.scsi.cmd.disk.dev.serr@P, discard_if_config_unknown=1;
88 event ereport.io.scsi.cmd.disk.dev.uderr@P, discard_if_config_unknown=1;
89 event ereport.io.scsi.cmd.disk.recovered@P, discard_if_config_unknown=1;
90 event ereport.io.scsi.cmd.disk.tran@P, discard_if_config_unknown=1;
91
92 /*
93 * For some ereports the 'driver-assessment' payload member selects fault vs.
94 * upset via propagation constraints; a payload without "fatal" is non-fatal.
95 */
96 #define DRIVER_ASSESSMENT_FATAL \
97 (payloadprop_contains("driver-assessment", "fatal"))
98 #define DRIVER_ASSESSMENT_NONFATAL (!DRIVER_ASSESSMENT_FATAL)
99
100 /*
101 * disk-as-detector: propagations from faults (based on
102 * DRIVER_ASSESSMENT_FATAL).
103 * We need to set additional fault payloads to indicate fault details.
104 * The payloads we may need are listed below (op_code is currently not set):
105 * fault.io.scsi.cmd.disk.dev.rqs.derr
106 * op_code, key, asc, ascq
107 * fault.io.scsi.cmd.disk.dev.rqs.merr
108 * op_code, key, asc, ascq, lba
109 */
110 prop fault.io.scsi.cmd.disk.dev.rqs.derr@P->
111 ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_FATAL &&
112 setpayloadprop("key", payloadprop("key")) &&
113 setpayloadprop("asc", payloadprop("asc")) &&
114 setpayloadprop("ascq", payloadprop("ascq"))};
115
116 /*
117 * Use setserdsuffix with the ereport's LBA so that errors are counted per
118 * LBA: the SERD engine only triggers if the error recurs on the same LBA.
119 */
120 prop fault.io.scsi.cmd.disk.dev.rqs.merr@P->
121 ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_FATAL &&
122 setserdsuffix(payloadprop("lba")) &&
123 setpayloadprop("key", payloadprop("key")) &&
124 setpayloadprop("asc", payloadprop("asc")) &&
125 setpayloadprop("ascq", payloadprop("ascq")) &&
126 setpayloadprop("lba", payloadprop("lba"))};
127
128 /*
129 * NOTE: this uses the "may" propagation of eversholt, written as (0).
130 * The dummy ereport need never exist. It is just a way of making
131 * the diagnosis wait for the within(60s) time on that ereport
132 * to complete. Once it has completed, the diagnosis continues
133 * even though the dummy ereport didn't occur.
134 */
135 event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P {within(60s)};
136 prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->
137 ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;
138
139 /*
140 * The uderr fault will be propagated at some future time.
141 * prop fault.io.scsi.cmd.disk.dev.uderr@P->
142 * ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
143 */
144
145 /*
146 * disk-as-detector: propagations from upsets (based on
147 * DRIVER_ASSESSMENT_NONFATAL).
148 */
149 prop upset.io.scsi.cmd.disk.dev.rqs.derr@P->
150 ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_NONFATAL };
151
152 prop upset.io.scsi.cmd.disk.dev.rqs.merr@P->
153 ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_NONFATAL };
154
155 /*
156 * disk-as-detector: propagations from upsets (independent of
157 * driver-assessment): these ereports always resolve to an upset.
158 */
159
160 prop upset.io.scsi.cmd.disk.dev.serr@P->
161 ereport.io.scsi.cmd.disk.dev.serr@P;
162
163 prop upset.io.scsi.cmd.disk.dev.uderr@P->
164 ereport.io.scsi.cmd.disk.dev.uderr@P;
165
166 prop upset.io.scsi.cmd.disk.recovered@P->
167 ereport.io.scsi.cmd.disk.recovered@P;
168
169 prop upset.io.scsi.cmd.disk.tran@P->
170 ereport.io.scsi.cmd.disk.tran@P;
171
172 /*
173 * --------------------------------------
174 * The remainder of this file contains rules associated with the operation of
175 * cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
176 *
177 * The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
178 * generated by the disk-transport fmd module, and the resulting faults.
179 */
180
181 /*
182 * Fault events. Each carries FITrate=10 and names the disk as FRU and ASRU.
183 */
184 event fault.io.disk.over-temperature@P,
185 FITrate=10, FRU=P, ASRU=P;
186 event fault.io.disk.predictive-failure@P,
187 FITrate=10, FRU=P, ASRU=P;
188 event fault.io.disk.self-test-failure@P,
189 FITrate=10, FRU=P, ASRU=P;
190 event fault.io.disk.ssm-wearout@P, FITrate=10, FRU=P, ASRU=P;
191
192 /*
193 * ereports generated by the disk-transport fmd module.
194 */
195 event ereport.io.scsi.disk.over-temperature@P;
196 event ereport.io.scsi.disk.predictive-failure@P;
197 event ereport.io.scsi.disk.self-test-failure@P;
198 event ereport.io.scsi.disk.ssm-wearout@P;
199
200 /*
201 * Propagations: each fault is explained by exactly one ereport class.
202 */
203 prop fault.io.disk.over-temperature@P ->
204 ereport.io.scsi.disk.over-temperature@P;
205
206 prop fault.io.disk.self-test-failure@P ->
207 ereport.io.scsi.disk.self-test-failure@P;
208 /* Copy the sense code and qualifier into the fault as "asc" and "ascq". */
209 prop fault.io.disk.predictive-failure@P ->
210 ereport.io.scsi.disk.predictive-failure@P {
211 setpayloadprop("asc", payloadprop("additional-sense-code")) &&
212 setpayloadprop("ascq", payloadprop("additional-sense-code-qualifier")) };
213 /* Copy the current and threshold wearout values into the fault payload. */
214 prop fault.io.disk.ssm-wearout@P ->
215 ereport.io.scsi.disk.ssm-wearout@P {
216 setpayloadprop("current-wearout-percentage",
217 payloadprop("current-ssm-wearout"))
218 && setpayloadprop("threshold-wearout-percentage",
219 payloadprop("threshold-ssm-wearout")) };