Print this page
8074 need to add FMA event for SSD wearout
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/cmd/fm/eversholt/files/common/disk.esc
+++ new/usr/src/cmd/fm/eversholt/files/common/disk.esc
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
↓ open down ↓ |
12 lines elided |
↑ open up ↑ |
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
23 + * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
23 24 */
24 25
25 26 #pragma dictionary "DISK"
26 27
27 28 #define P disk
28 29
29 30 fru P;
30 31 asru P;
31 32
32 33 /*
33 34 * Overall comments for this file:
34 35 * <disk-as-detector> The disk-as-detector DE provides the mapping between
35 36 * ereports generated by a kernel disk driver sd(7D) and resulting faults.
36 37 */
↓ open down ↓ |
4 lines elided |
↑ open up ↑ |
37 38
38 39 /*
39 40 * SERD engine for media error fault propagation:
40 41 *
41 42 * This strategy is designed to give a file system, like ZFS, the
42 43 * ability to attempt data recovery/relocation without faulting a disk.
43 44 * This implementation depends on a file system retry to the same lba
44 45 * to trigger a fault when recovery/relocation is not possible.
45 46 *
46 47 * We let the engine propagate at most one error per minute; if we then
47 - * still get 2 or more * errors within 24 hours for the same LBA, there is a fault.
48 + * still get 2 or more errors within 24 hours for the same LBA,
49 + * there is a fault.
48 50 */
49 51 engine serd.io.scsi.cmd.disk.dev.rqs.merr@P, N=1, T=24h;
50 52
51 53 /*
52 54 * disk-as-detector: fault events.
53 55 */
54 56 event fault.io.scsi.cmd.disk.dev.rqs.derr@P;
55 57 event fault.io.scsi.cmd.disk.dev.rqs.merr@P,
56 58 engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;
57 59
58 60 /*
59 61 * The uderr fault will be defined at some future time.
60 62 * event fault.io.scsi.cmd.disk.dev.uderr@P;
61 63 */
62 64
63 65 /*
64 66 * disk-as-detector: upset events.
65 67 * NOTE: For now we define an upset to implement discard.
66 68 */
67 69 event upset.io.scsi.cmd.disk.dev.rqs.derr@P;
68 70 event upset.io.scsi.cmd.disk.dev.rqs.merr@P;
69 71 event upset.io.scsi.cmd.disk.dev.uderr@P;
70 72 event upset.io.scsi.cmd.disk.dev.serr@P;
71 73 event upset.io.scsi.cmd.disk.tran@P;
72 74 event upset.io.scsi.cmd.disk.recovered@P;
73 75
74 76 /*
75 77 * disk-as-detector: ereports from the kernel.
76 78 *
77 79 * We don't know the topology for all scsi disks, but the kernel will always
78 80 * generate ereport telemetry assuming that we do. We define these ereports
79 81 * with 'discard_if_config_unknown=1', which permits ereports against things
80 82 * with unknown topology to be silently discarded. The ereport data is logged
81 83 * in either case, and can be viewed via 'fmdump -eV'.
82 84 */
83 85 event ereport.io.scsi.cmd.disk.dev.rqs.derr@P, discard_if_config_unknown=1;
84 86 event ereport.io.scsi.cmd.disk.dev.rqs.merr@P, discard_if_config_unknown=1;
85 87 event ereport.io.scsi.cmd.disk.dev.serr@P, discard_if_config_unknown=1;
86 88 event ereport.io.scsi.cmd.disk.dev.uderr@P, discard_if_config_unknown=1;
87 89 event ereport.io.scsi.cmd.disk.recovered@P, discard_if_config_unknown=1;
88 90 event ereport.io.scsi.cmd.disk.tran@P, discard_if_config_unknown=1;
89 91
90 92 /*
91 93 * For some ereports we let the 'driver-assessment', communicated as part of
92 94 * the ereport payload, determine fault vs. upset via propagation constraints.
93 95 */
94 96 #define DRIVER_ASSESSMENT_FATAL \
95 97 (payloadprop_contains("driver-assessment", "fatal"))
96 98 #define DRIVER_ASSESSMENT_NONFATAL (!DRIVER_ASSESSMENT_FATAL)
97 99
98 100 /*
99 101 * disk-as-detector: propagations from faults (based on
100 102 * DRIVER_ASSESSMENT_FATAL).
101 103 * We need to set additional fault payloads to indicate fault details.
102 104 * The payload properties we may need are listed below:
103 105 * fault.io.scsi.cmd.disk.dev.rqs.derr
104 106 * op_code, key, asc, ascq
105 107 * fault.io.scsi.cmd.disk.dev.rqs.merr
106 108 * op_code, key, asc, ascq, lba
107 109 */
108 110 prop fault.io.scsi.cmd.disk.dev.rqs.derr@P->
109 111 ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_FATAL &&
110 112 setpayloadprop("key", payloadprop("key")) &&
111 113 setpayloadprop("asc", payloadprop("asc")) &&
112 114 setpayloadprop("ascq", payloadprop("ascq"))};
113 115
114 116 /*
115 117 * Use setserdsuffix with the specific LBA so that
116 118 * the serd engine only triggers if the fault recurs on the same LBA.
117 119 */
118 120 prop fault.io.scsi.cmd.disk.dev.rqs.merr@P->
119 121 ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_FATAL &&
120 122 setserdsuffix(payloadprop("lba")) &&
121 123 setpayloadprop("key", payloadprop("key")) &&
122 124 setpayloadprop("asc", payloadprop("asc")) &&
123 125 setpayloadprop("ascq", payloadprop("ascq")) &&
124 126 setpayloadprop("lba", payloadprop("lba"))};
125 127
126 128 /*
127 129 * NOTE: this propagation uses the "may" propagation of eversholt.
128 130 * The ereport need never exist. It's just a way of making
129 131 * the diagnosis wait for the within time on that ereport
130 132 * to complete. Once it has completed the diagnosis continues
131 133 * even though the dummy ereport didn't occur.
132 134 */
133 135 event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P {within(60s)};
134 136 prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->
135 137 ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;
136 138
137 139 /*
138 140 * The uderr fault will be propagated at some future time.
139 141 * prop fault.io.scsi.cmd.disk.dev.uderr@P->
140 142 * ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
141 143 */
142 144
143 145 /*
144 146 * disk-as-detector: propagations from upsets (based on
145 147 * DRIVER_ASSESSMENT_NONFATAL).
146 148 */
147 149 prop upset.io.scsi.cmd.disk.dev.rqs.derr@P->
148 150 ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_NONFATAL };
149 151
150 152 prop upset.io.scsi.cmd.disk.dev.rqs.merr@P->
151 153 ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_NONFATAL };
152 154
153 155 /*
154 156 * disk-as-detector: propagations from upsets (independent of
155 157 * driver-assessment)
156 158 */
157 159
158 160 prop upset.io.scsi.cmd.disk.dev.serr@P->
159 161 ereport.io.scsi.cmd.disk.dev.serr@P;
160 162
161 163 prop upset.io.scsi.cmd.disk.dev.uderr@P->
162 164 ereport.io.scsi.cmd.disk.dev.uderr@P;
163 165
164 166 prop upset.io.scsi.cmd.disk.recovered@P->
165 167 ereport.io.scsi.cmd.disk.recovered@P;
166 168
167 169 prop upset.io.scsi.cmd.disk.tran@P->
168 170 ereport.io.scsi.cmd.disk.tran@P;
169 171
170 172 /*
171 173 * --------------------------------------
172 174 * The remainder of this file contains rules associated with the operation of
173 175 * cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
174 176 *
175 177 * The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
176 178 * generated by the disk-transport fmd module, and the resulting faults.
177 179 */
↓ open down ↓ |
120 lines elided |
↑ open up ↑ |
178 180
179 181 /*
180 182 * Fault events.  Every fault names the disk itself as both FRU and ASRU.
181 183 */
182 184 event fault.io.disk.over-temperature@P,
183 185 FITrate=10, FRU=P, ASRU=P;
184 186 event fault.io.disk.predictive-failure@P,
185 187 FITrate=10, FRU=P, ASRU=P;
186 188 event fault.io.disk.self-test-failure@P,
187 189 FITrate=10, FRU=P, ASRU=P;
190 +event fault.io.disk.ssm-wearout@P, FITrate=10, FRU=P, ASRU=P;
188 191
189 192 /*
190 193 * ereports.
191 194 */
192 195 event ereport.io.scsi.disk.over-temperature@P;
193 196 event ereport.io.scsi.disk.predictive-failure@P;
194 197 event ereport.io.scsi.disk.self-test-failure@P;
198 +event ereport.io.scsi.disk.ssm-wearout@P;
195 199
196 200 /*
197 201 * Propagations.
198 202 */
199 203 prop fault.io.disk.over-temperature@P ->
200 204 ereport.io.scsi.disk.over-temperature@P;
201 205
202 206 prop fault.io.disk.self-test-failure@P ->
203 207 ereport.io.scsi.disk.self-test-failure@P;
204 208
205 209 prop fault.io.disk.predictive-failure@P ->
206 210 ereport.io.scsi.disk.predictive-failure@P {
207 211 setpayloadprop("asc", payloadprop("additional-sense-code")) &&
208 212 setpayloadprop("ascq", payloadprop("additional-sense-code-qualifier")) };
213 +
214 +prop fault.io.disk.ssm-wearout@P ->
215 + ereport.io.scsi.disk.ssm-wearout@P {
216 + setpayloadprop("current-wearout-percentage",
217 + payloadprop("current-ssm-wearout"))
218 + && setpayloadprop("threshold-wearout-percentage",
219 + payloadprop("threshold-ssm-wearout")) };
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX