TrueNAS Scale Resilvering Issues

So I'm having some issues with my NAS and with rebuilding an array (TrueNAS). Let me start at the beginning.

Setup: 8x 10TB (half Seagate Exos, half WD Red Pro)
SAS2008 HBA
Ryzen 5 + server motherboard
128 GB ECC DDR4 RAM

  1. I started seeing checksum errors on one of my drives. I tried some scrubs and clears, and the error count kept jumping around (61, 133, 271, 45); at that point I decided to replace the drive. (Rough commands for the whole scrub/clear/replace cycle are sketched just after this list.)

  2. I replaced the drive, but the new replacement drive is showing the same checksum errors and scrubs are not fixing them. The counts are still jumping around after clears and scrubs (150, 61, 23). I tried swapping SAS ports, and the issue follows the drive.

  3. I figured the new drive had been damaged in shipping, so I got a second replacement WD Red Pro and resilvered the array again. The checksum errors returned and the drive errored out; scrubs don't fix it, and the checksum errors continue after clears and repeated scrubs.

  4. I figured it was a SAS card issue, so I replaced it with a SAS3008 card plus new cabling, wiped the drive, and rebuilt. I'm still seeing checksum errors; I tried scrubs and clearing again, but the numbers jump around after each scrub (1, 61, 32).
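
For anyone wanting to follow along, this is roughly what that scrub/clear/replace cycle boils down to at the command line (the pool name is the one from my logs further down; the device paths are placeholders, not my actual disks):

    # check pool health and the per-device read/write/checksum counters
    zpool status -v storage_pool

    # reset the error counters, then re-verify all data on the pool
    zpool clear storage_pool
    zpool scrub storage_pool

    # swap in a replacement disk and watch the resilver
    zpool replace storage_pool /dev/disk/by-id/OLD-DISK /dev/disk/by-id/NEW-DISK
    zpool status storage_pool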

** Note: none of the drives are failing SMART; the array goes into a degraded state but does not fail.
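
For clarity on the SMART point: the drives all pass the checks TrueNAS schedules, which are basically smartctl under the hood. The per-drive check looks roughly like this (drive path is just an example):

    # health verdict plus the full attribute and error tables
    smartctl -a /dev/sdX

    # attributes worth checking on a suspect disk: Reallocated_Sector_Ct,
    # Current_Pending_Sector, Offline_Uncorrectable, UDMA_CRC_Error_Count
    smartctl -A /dev/sdX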

I do have all this data backed up, but I'm concerned that ZFS is unable to repair this and that it points to a larger hardware issue.

What do?

I have pulled the logs and the errors are kind of odd. They keep saying the drive is blank in places where it should have data, which is not what I'd expect; with ordinary bit rot I'd expect flipped bits, not empty ranges. Logs are below.
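
(The ereports below come straight out of the ZFS event log; this is the command that dumps them in this verbose format:)

    # dump verbose ZFS error events, including checksum ereports like the ones below
    zpool events -v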

Aug 27 2023 17:15:56.924972503 ereport.fs.zfs.checksum
        class = "ereport.fs.zfs.checksum"
        ena = 0x4ebc9aad14000801
        detector = (embedded nvlist)
                version = 0x0
                scheme = "zfs"
                pool = 0x70469b758805927c
                vdev = 0x9953bc42d191f128
        (end detector)
        pool = "storage_pool"
        pool_guid = 0x70469b758805927c
        pool_state = 0x0
        pool_context = 0x0
        pool_failmode = "continue"
        vdev_guid = 0x9953bc42d191f128
        vdev_type = "disk"
        vdev_path = "/dev/disk/by-partuuid/835c0980-932c-4093-95c9-37e1d0fb4478"
        vdev_ashift = 0x9
        vdev_complete_ts = 0x54ebc9a12ba7
        vdev_delta_ts = 0xf973
        vdev_read_errors = 0x0
        vdev_write_errors = 0x0
        vdev_cksum_errors = 0x1
        vdev_delays = 0x0
        parent_guid = 0x22ce662618e98491
        parent_type = "raidz"
        vdev_spare_paths = 
        vdev_spare_guids = 
        zio_err = 0x0
        zio_flags = 0x1008b0
        zio_stage = 0x200000
        zio_pipeline = 0x1f00000
        zio_delay = 0x0
        zio_timestamp = 0x0
        zio_delta = 0x0
        zio_priority = 0x4
        zio_offset = 0x27906359000
        zio_size = 0x1000
        zio_objset = 0x0
        zio_object = 0x0
        zio_level = 0x0
        zio_blkid = 0x0
        bad_ranges = 0x0 0x1d0 0x1e8 0x768 0x780 0x7d8 0x7e8 0x800 
        bad_ranges_min_gap = 0x8
        bad_range_sets = 0x0 0x0 0x0 0x0 
        bad_range_clears = 0xe67 0x2bf4 0x28b 0x85 
        bad_set_histogram = 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 
        bad_cleared_histogram = 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf5 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf5 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf6 0xf5 0xf5 0xf5 0xf5 0xf5 0xf5 0xf5 0xf5 0xf5 0xf5 0xf5 0xf6 0xf6 0xf6 0xf6 0xf6 0xf5 0xf5 0xf5 0xf5 0xf5 0xf5 0xf5 0xf5 
        time = 0x64ebbd0c 0x3721f5d7 
        eid = 0xce

Aug 27 2023 17:38:48.402709403 ereport.fs.zfs.checksum
        class = "ereport.fs.zfs.checksum"
        ena = 0x62b1b18cc4000001
        detector = (embedded nvlist)
                version = 0x0
                scheme = "zfs"
                pool = 0x70469b758805927c
                vdev = 0x9953bc42d191f128
        (end detector)
        pool = "storage_pool"
        pool_guid = 0x70469b758805927c
        pool_state = 0x0
        pool_context = 0x0
        pool_failmode = "continue"
        vdev_guid = 0x9953bc42d191f128
        vdev_type = "disk"
        vdev_path = "/dev/disk/by-partuuid/835c0980-932c-4093-95c9-37e1d0fb4478"
        vdev_ashift = 0x9
        vdev_complete_ts = 0x562b1ae18c7a
        vdev_delta_ts = 0x22f8e12
        vdev_read_errors = 0x0
        vdev_write_errors = 0x0
        vdev_cksum_errors = 0x2
        vdev_delays = 0x0
        parent_guid = 0x22ce662618e98491
        parent_type = "raidz"
        vdev_spare_paths = 
        vdev_spare_guids = 
        zio_err = 0x0
        zio_flags = 0x100880
        zio_stage = 0x200000
        zio_pipeline = 0x1f00000
        zio_delay = 0x0
        zio_timestamp = 0x0
        zio_delta = 0x0
        zio_priority = 0x0
        zio_offset = 0x27a4e4bb000
        zio_size = 0x1000
        zio_objset = 0x71
        zio_object = 0x1118
        zio_level = 0x0
        zio_blkid = 0x0
        bad_ranges = 0x0 0x518 0x530 0x800 
        bad_ranges_min_gap = 0x8
        bad_range_sets = 0x0 0x0 
        bad_range_clears = 0x2891 0x167c 
        bad_set_histogram = 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 
        bad_cleared_histogram = 0xfd 0xfd 0xfd 0xfd 0xfc 0xfc 0xfc 0xfc 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfd 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 0xfc 
        time = 0x64ebc268 0x1800db9b 
        eid = 0xcf
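
For anyone cross-checking: the vdev_path in these ereports can be mapped back to a physical disk like this (the partuuid is the one from the events above; the lsblk target is a placeholder):

    # resolve the partuuid from the ereports to the underlying block device
    readlink -f /dev/disk/by-partuuid/835c0980-932c-4093-95c9-37e1d0fb4478

    # then confirm which physical drive (model/serial) that device is
    lsblk -o NAME,MODEL,SERIAL /dev/sdX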

Are you running a backplane or directly attaching the HDDs to the HBA?
Does IPMI log any ECC events?

Drives are directly attached to the HBA. I checked the IPMI and there are no ECC events, only clock syncs with NTP. I also checked dmesg to see if there were any obvious hardware errors and I'm not seeing any. I have also set the HDDs to always-on and APM to 254 (maximum performance).
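
For reference, the shell-level versions of those checks look roughly like this (drive path is a placeholder):

    # kernel log: look for controller resets, timeouts, and I/O errors
    dmesg | grep -iE 'error|fail|reset|timeout'

    # IPMI system event log, where ECC errors would be reported
    ipmitool sel elist

    # read back the drive's APM level (254 = maximum performance, no spindown)
    hdparm -B /dev/sdX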

I'm going to try a new resilver with the new drive connected to the motherboard SATA ports, then a scrub to see what happens. I also updated the firmware on the HBA last night to 16.0.12.0 based on the TrueNAS recommendations for the SAS3008. Maybe this rebuild will fix it.
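
If anyone wants to double-check the HBA firmware from the shell, the Broadcom utility reports it (assuming sas3flash is installed on the box; I don't believe it ships with TrueNAS by default):

    # list the SAS3008 adapter and the firmware/BIOS versions it is running
    sas3flash -list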
