# Originally posted here:
# Spencer - An Email Alert Script for Potentially Hidden Problems | TrueNAS Community
# Spencer is a basic Python script which is designed to run in conjunction with "multi_report" by Joe Schmuck
# https://github.com/JoeSchmuck/Multi-Report
# Spencer checks for additional errors which may appear in your logs that you otherwise may be unaware of.
# The initial version of this script is versioned v1.1 and was written by ChatGPT and NickF
# Importing necessary modules
# subprocess is used for process-related tasks such as running a shell command
# datetime is used to get the current date and time
# socket is used to get the hostname of the system where the script is running
import subprocess
import datetime
import socket
# Name of the host running the script; it is embedded in both subject lines
# so reports from several systems can be told apart.
hostname = socket.gethostname()

# Address that receives the report. The same value is also used for the
# "From:" header of the outgoing message.
# NOTE: this is a placeholder (the previously published value was the
# obfuscation residue "[email protected]", which is not a valid address).
# Replace it with a real mailbox before deploying.
DEFAULT_RECIPIENT = "user@example.com"

# Subject lines for the "errors found" and "nothing found" reports.
ERROR_SUBJECT = f"[SPENCER] [ERROR] Error Found in Log for {hostname}"
SUCCESS_SUBJECT = f"[SPENCER] [SUCCESS] All is Good! for {hostname}"

# Progress message emitted as soon as the script starts.
print(f"{datetime.datetime.now()} - Spencer is checking log files for errors.")
def get_os_version(version_file="/etc/version"):
    """Return the operating system version string.

    The version file is read directly instead of spawning ``cat`` in a
    subprocess, which avoids a process launch for a plain file read.

    Args:
        version_file: Path of the file that holds the version string.
            Defaults to ``/etc/version``.

    Returns:
        The file's contents with leading and trailing whitespace removed, or
        an empty string when the file cannot be opened or read. The empty
        string matches what the earlier ``cat``-based implementation yielded
        on failure (``cat`` printed nothing to stdout).
    """
    try:
        with open(version_file, "r") as handle:
            return handle.read().strip()
    except OSError:
        # Missing or unreadable file: keep the historical "empty version"
        # result rather than raising from here.
        return ""
def _is_scale_version(os_version):
    """Return True when ``os_version`` carries a SCALE major version (20-30).

    The major version is taken to be the first run of ASCII digits in the
    string. Unlike ``int(os_version[:2])`` this does not raise ``ValueError``
    for an empty string, a single-digit major ("9.10") or a string that
    starts with non-numeric text. A string without any digits is classified
    as not-SCALE, i.e. it falls through to the CORE handling.
    """
    digits = ""
    for char in os_version:
        if "0" <= char <= "9":
            digits += char
        elif digits:
            break
    return bool(digits) and 20 <= int(digits) <= 30


def _classify_log_line(line_lower, is_scale):
    """Return the match-count key for a lower-cased log line, or None."""
    # NOTE: "inquiry" appears in both the error list and the ignore list, so
    # a CDB line mentioning it is never reported; both lists are kept as-is.
    cdb_errors = ("read", "write", "verify", "inquiry", "mode sense")
    cdb_ignored = ("inquiry",)
    cam_errors = (
        "scsi status error",
        "ata status error",
        "command timeout",
        "command aborted",
    )
    cam_ignored = ("command retry", "command complete")

    if is_scale:
        # SCALE: SCSI CDB errors first, then iSCSI timeouts.
        if (
            "cdb:" in line_lower
            and any(error in line_lower for error in cdb_errors)
            and not any(ignore in line_lower for ignore in cdb_ignored)
        ):
            return "cdb_errors"
        if "iscsi" in line_lower and (
            "timeout" in line_lower or "timed out" in line_lower
        ):
            return "iscsi_timeout"
        return None

    # Anything that is not SCALE is assumed to be CORE.
    if "ctl_datamove" in line_lower and "aborted" in line_lower:
        return "ctl_datamove"
    if (
        "cam status:" in line_lower
        and any(error in line_lower for error in cam_errors)
        and not any(ignore in line_lower for ignore in cam_ignored)
    ):
        return "cam_status_error"
    return None


def search_log_file(os_version, log_path="/var/log/messages"):
    """Scan the system log for known disk and iSCSI error patterns.

    Matching is case-insensitive. Which patterns are checked depends on
    whether ``os_version`` is classified as SCALE (major version 20-30) or
    CORE (everything else).

    Args:
        os_version: Version string as returned by ``get_os_version``.
        log_path: Log file to scan. Defaults to ``/var/log/messages``.

    Returns:
        A ``(matches, match_counts)`` tuple. ``matches`` is a list of text
        excerpts, each made of the line before the hit, the hit itself and
        the line after it (stripped of surrounding whitespace).
        ``match_counts`` maps ``"ctl_datamove"``, ``"cdb_errors"``,
        ``"cam_status_error"`` and ``"iscsi_timeout"`` to the number of hits.

    Raises:
        OSError: If ``log_path`` cannot be opened.
    """
    matches = []
    match_counts = {
        "ctl_datamove": 0,
        "cdb_errors": 0,
        "cam_status_error": 0,
        "iscsi_timeout": 0,
    }

    # The platform does not change while scanning, so classify it once
    # instead of re-parsing the version string for every log line.
    is_scale = _is_scale_version(os_version)

    # errors="replace": a stray undecodable byte in the log must not abort
    # the whole check with UnicodeDecodeError.
    with open(log_path, "r", errors="replace") as file:
        lines = file.readlines()

    for i, line in enumerate(lines):
        category = _classify_log_line(line.lower(), is_scale)
        if category is None:
            continue
        # Keep one line of context on each side of the hit.
        context = lines[max(0, i - 1):i + 2]
        matches.append("".join(context).strip())
        match_counts[category] += 1

    return matches, match_counts


def generate_table(match_counts, os_version):
    """Build the plain-text, tab-separated summary table of match counts.

    The SCALE/CORE decision uses the same rule as ``search_log_file`` so the
    table always reports the categories that were actually searched.
    Previously major versions 20 and 21 were searched as SCALE but tabulated
    as CORE, and versions outside both ranges produced an empty table even
    though the CORE patterns had been searched.

    Args:
        match_counts: Mapping of category name to hit count, as returned by
            ``search_log_file``.
        os_version: Version string as returned by ``get_os_version``.

    Returns:
        A four-line string (each line newline-terminated) listing the disk
        and iSCSI error counts for the detected platform.
    """
    if _is_scale_version(os_version):
        rows = (
            "Disk Error Type\tCount (SCALE)",
            f"cdb_errors\t{match_counts['cdb_errors']}",
            "iSCSI Error Type\tCount (SCALE)",
            f"iscsi_timeout\t{match_counts['iscsi_timeout']}",
        )
    else:
        rows = (
            "Disk Error Type\tCount (CORE)",
            f"cam_status_error\t{match_counts['cam_status_error']}",
            "iSCSI Error Type\tCount (CORE)",
            f"ctl_datamove\t{match_counts['ctl_datamove']}",
        )
    return "".join(f"{row}\n" for row in rows)
def send_email(content, to_address, subject):
    """Send a plain-text email through the local ``sendmail`` command.

    The message is written to sendmail's standard input; ``-t`` makes
    sendmail take the recipients from the message headers and ``-oi`` stops
    a line containing a single dot from ending the message early.

    Args:
        content: Body of the message.
        to_address: Address placed in the ``To:`` header.
        subject: Text placed in the ``Subject:`` header.

    Raises:
        OSError: If the ``sendmail`` executable cannot be started.
    """
    email_message = (
        f"From: {DEFAULT_RECIPIENT}\n"
        f"To: {to_address}\n"
        f"Subject: {subject}\n"
        f"\n"
        f"{content}"
    )
    sendmail_command = ["sendmail", "-t", "-oi"]
    result = subprocess.run(sendmail_command, input=email_message.encode())

    # Only claim success when sendmail actually accepted the message; the
    # success line used to be printed regardless of the exit status.
    if result.returncode == 0:
        print(f"{datetime.datetime.now()} - Spencer sent an email successfully!")
    else:
        print(
            f"{datetime.datetime.now()} - Spencer could not send the email: "
            f"sendmail exited with status {result.returncode}."
        )
# Reports are delivered to the default recipient.
to_address = DEFAULT_RECIPIENT

# Detect the OS version, scan the log with it, and tabulate the hit counts.
os_version = get_os_version()
matches, match_counts = search_log_file(os_version)
table_content = generate_table(match_counts, os_version)

# Message body: the table, followed either by every matched excerpt
# (separated by blank lines) or by a note that nothing was found.
if not matches:
    content = table_content + "\n\nNo matching lines found in the log file."
else:
    content = "\n\n".join([table_content, *matches])

# The subject line reflects whether any match was found.
send_email(
    content,
    to_address,
    SUCCESS_SUBJECT if not matches else ERROR_SUBJECT,
)