# Spencer - TrueNAS Alert Script for Potentially Hidden Problems
#
# Originally posted on the TrueNAS Community forum as:
# "Spencer - An Email Alert Script for Potentially Hidden Problems"

# Spencer is a basic Python script which is designed to run in conjunction with "multi_report" by Joe Schmuck
# https://github.com/JoeSchmuck/Multi-Report
# Spencer checks for additional errors which may appear in your logs that you otherwise may be unaware of.
# The initial version of this script is versioned v1.1 and was written by ChatGPT and NickF

# Importing necessary modules
# subprocess is used for process-related tasks such as running a shell command
# datetime is used to get the current date and time
# socket is used to get the hostname of the system where the script is running
import subprocess
import datetime
import socket

# Hostname of the machine running the script; it is embedded in both email subjects below
# so that reports from several systems can be told apart.
hostname = socket.gethostname()

# Email related constants.
# DEFAULT_RECIPIENT is used both as the "To:" address (see to_address further down) and as the
# "From:" address written by send_email().
# NOTE(review): "[email protected]" is not a deliverable address — it looks like a redacted
# placeholder; replace it with a real mailbox before running. TODO confirm the intended value.
DEFAULT_RECIPIENT = "[email protected]"
# Subject used when at least one matching log line was found.
ERROR_SUBJECT = f"[SPENCER] [ERROR] Error Found in Log for {hostname}"
# Subject used when no matching log line was found.
SUCCESS_SUBJECT = f"[SPENCER] [SUCCESS] All is Good! for {hostname}"

# Timestamped start-of-run message written to standard output as soon as the module is executed.
print(f"{datetime.datetime.now()} - Spencer is checking log files for errors.")

# Reads the operating system version string from /etc/version.
def get_os_version():
    """Return the text of /etc/version with surrounding whitespace removed.

    The file is read by running ``cat /etc/version`` in a subprocess and
    capturing its standard output as text. The command's exit status is not
    checked, so the result is whatever ``cat`` wrote to standard output.
    """
    completed = subprocess.run(
        ["cat", "/etc/version"],
        capture_output=True,
        text=True,
    )
    return completed.stdout.strip()

# Helper for search_log_file: extracts the major version number from a version string.
def _major_version(os_version):
    """Return the major version number found in *os_version*, or None.

    The first two characters are tried first, exactly as this script has
    always done, so every string that parsed before yields the same number.
    When that fails (an empty string, a single-digit major such as "9.3", or
    some text in front of the number) the first run of decimal digits found
    anywhere in the string is used instead. None means no number was found.
    """
    try:
        return int(os_version[:2])
    except ValueError:
        pass

    digits = ""
    for char in os_version:
        if char.isdecimal():
            digits += char
        elif digits:
            # The first run of digits has ended; ignore the rest of the string.
            break
    return int(digits) if digits else None


# This function searches the log file for specified errors and counts the occurrences.
# It distinguishes between operating system versions (SCALE vs CORE) and looks for
# the error patterns that belong to each.
def search_log_file(os_version, log_path="/var/log/messages"):
    """Scan a log file for known error patterns.

    Parameters:
        os_version: version string as returned by get_os_version(). A major
            version from 20 to 30 (inclusive) is treated as SCALE; anything
            else, including a string with no number in it, is treated as CORE.
        log_path: file to scan. Defaults to /var/log/messages.

    Returns:
        A ``(matches, match_counts)`` tuple. ``matches`` is a list of strings,
        each holding the matching line together with the line before and the
        line after it (where they exist), stripped of surrounding whitespace.
        ``match_counts`` maps "ctl_datamove", "cdb_errors", "cam_status_error"
        and "iscsi_timeout" to the number of matching lines of that kind.

    Raises:
        OSError: if the log file cannot be opened or read.
    """
    match_counts = {  # Dictionary to store the match counts
        "ctl_datamove": 0,
        "cdb_errors": 0,
        "cam_status_error": 0,
        "iscsi_timeout": 0,
    }

    # Keywords that make a line count as an error, and keywords that veto it.
    cdb_errors = ("read", "write", "verify", "inquiry", "mode sense")
    cam_errors = ("scsi status error", "ata status error", "command timeout", "command aborted")
    cam_ignored = ("command retry", "command complete")
    cdb_ignored = ("inquiry",)

    # The platform does not change while scanning, so decide it once up front
    # instead of re-parsing the version string for every log line.
    major = _major_version(os_version)
    is_scale = major is not None and 20 <= major <= 30

    # errors="replace" keeps a stray undecodable byte in the log from aborting
    # the whole run; such bytes are replaced by a marker character instead.
    with open(log_path, "r", errors="replace") as file:
        lines = file.readlines()

    matches = []  # List to store the matched lines (with their context)
    for i, line in enumerate(lines):
        line_lower = line.lower()  # Lowercase copy for case-insensitive matching

        if is_scale:
            # SCALE: look for cdb_errors first, then iscsi_timeout.
            if (
                "cdb:" in line_lower
                and any(error in line_lower for error in cdb_errors)
                and not any(ignore in line_lower for ignore in cdb_ignored)
            ):
                key = "cdb_errors"
            elif "iscsi" in line_lower and ("timeout" in line_lower or "timed out" in line_lower):
                key = "iscsi_timeout"
            else:
                continue
        else:
            # Not SCALE, so assume CORE: look for ctl_datamove first, then cam_status_error.
            if "ctl_datamove" in line_lower and "aborted" in line_lower:
                key = "ctl_datamove"
            elif (
                "cam status:" in line_lower
                and any(error in line_lower for error in cam_errors)
                and not any(ignore in line_lower for ignore in cam_ignored)
            ):
                key = "cam_status_error"
            else:
                continue

        # Capture the previous, current, and next line as context for the report.
        context = lines[max(0, i - 1) : i + 2]
        matches.append("".join(context).strip())
        match_counts[key] += 1

    # Return all matches and their counts
    return matches, match_counts

# This function generates a plain text table with the counts of matches, differentiating between CORE and SCALE versions.
def generate_table(match_counts, os_version):
    """Build the tab-separated summary table placed at the top of the email.

    Parameters:
        match_counts: mapping with the keys "ctl_datamove", "cdb_errors",
            "cam_status_error" and "iscsi_timeout", as produced by
            search_log_file().
        os_version: version string as returned by get_os_version().

    Returns:
        A four-line string (each line newline-terminated). For a major version
        from 20 to 30 it lists the SCALE counters (cdb_errors, iscsi_timeout);
        for anything else it lists the CORE counters (cam_status_error,
        ctl_datamove).

    The SCALE range is tested first and everything else is CORE, which is the
    same split search_log_file() uses. Previously the CORE test (11 to 21) ran
    first and overlapped the SCALE range, so versions 20 and 21 were scanned
    for SCALE errors but reported with the always-zero CORE counters.
    """
    # Try the first two characters, as before; fall back to the first run of
    # digits so an empty string, a single-digit major or a textual prefix no
    # longer raises ValueError.
    try:
        major = int(os_version[:2])
    except ValueError:
        digits = ""
        for char in os_version:
            if char.isdecimal():
                digits += char
            elif digits:
                break
        major = int(digits) if digits else None

    if major is not None and 20 <= major <= 30:  # SCALE versions
        label, disk_key, iscsi_key = "SCALE", "cdb_errors", "iscsi_timeout"
    else:  # everything else is reported as CORE
        label, disk_key, iscsi_key = "CORE", "cam_status_error", "ctl_datamove"

    rows = [
        f"Disk Error Type\tCount ({label})",
        f"{disk_key}\t{match_counts[disk_key]}",
        f"iSCSI Error Type\tCount ({label})",
        f"{iscsi_key}\t{match_counts[iscsi_key]}",
    ]
    return "\n".join(rows) + "\n"

# This function sends an email using the sendmail command.
# The email contains the table of matches and the list of matching lines from the log file.
def send_email(content, to_address, subject):
    """Send a plain-text email by piping it to the ``sendmail`` command.

    Parameters:
        content: body of the message.
        to_address: address written to the "To:" header.
        subject: text written to the "Subject:" header.

    The "From:" header is always DEFAULT_RECIPIENT. Nothing is returned; a
    timestamped line reporting the outcome is printed instead. A failure of
    sendmail is reported but not raised, so the script still ends normally.

    Raises:
        OSError: if the sendmail program cannot be started at all.
    """
    email_message = f"From: {DEFAULT_RECIPIENT}\nTo: {to_address}\nSubject: {subject}\n\n{content}"

    # "-t" makes sendmail take the recipients from the message headers;
    # "-oi" stops a line holding a single "." from ending the message early.
    sendmail_command = ["sendmail", "-t", "-oi"]
    with subprocess.Popen(sendmail_command, stdin=subprocess.PIPE) as process:
        process.communicate(email_message.encode())

    # communicate() waits for sendmail to exit, so returncode is set here.
    # Only claim success when sendmail actually accepted the message.
    if process.returncode == 0:
        print(f"{datetime.datetime.now()} - Spencer sent an email successfully!")
    else:
        print(
            f"{datetime.datetime.now()} - Spencer could not send the email "
            f"(sendmail exited with status {process.returncode})."
        )

# Main flow of the script: gather the data, build the report, send it.

# Reports go to the default recipient.
to_address = DEFAULT_RECIPIENT

# Find out which OS version is running, scan the log accordingly,
# and summarize the counts in a plain text table.
os_version = get_os_version()
matches, match_counts = search_log_file(os_version)
table_content = generate_table(match_counts, os_version)

# The report body is the table, a blank line, and then either every matched
# log excerpt (separated by blank lines) or a note that nothing was found.
content = table_content + "\n\n" + (
    "\n\n".join(matches) if matches else "No matching lines found in the log file."
)

# The subject line tells the reader at a glance whether anything was found.
send_email(content, to_address, ERROR_SUBJECT if matches else SUCCESS_SUBJECT)
 
# (Forum post footer: "1 Like")