#!/bin/bash

#######################################
# Print one CSV row (url,status,category,reputation) for a URL, using the
# log file that url_categorization.sh is expected to have written for it.
#
# Arguments:
#   $1 - URL. The first "https://" found in it is removed to build the log
#        path data/<rest>.log, relative to the current working directory.
# Outputs:
#   stdout - "url,status,category,reputation"
#   stderr - a single warning when the log file cannot be read
# Notes:
#   - Only log lines containing "bgcolor=" are inspected.
#   - With a missing/unreadable log the row is still printed, with an empty
#     status, category "unknown" and an empty reputation.
#   - NOTE(review): only "https://" is stripped, so an http:// URL keeps its
#     scheme in the log path (data/http://...). Confirm how
#     url_categorization.sh names its logs before changing this.
#######################################
make_request() {
    local url=$1
    local scheme='https://'
    local url_normalized logfile rows status category reputation

    # Quoted pattern => literal match; removes the first occurrence only.
    url_normalized=${url/"$scheme"/}
    logfile="data/$url_normalized.log"

    # Read the log once and keep only the result-table rows.
    rows=''
    if [[ -f "$logfile" && -r "$logfile" ]]; then
        rows=$(grep -- 'bgcolor=' < "$logfile")
    else
        printf 'Warning: cannot read log file: %s\n' "$logfile" >&2
    fi

    status=$(grep -oP '(Uncategorized URL|Categorized URL)' <<< "$rows")

    # Extract the category. If there is more than one, take the first.
    category=$(grep -oP '(?<=- )[^<]+(?=<br />)' <<< "$rows" | head -n 1)
    category=${category:-unknown}

    # The reputation is the text of the last nowrap cell before </td></tr>.
    reputation=$(sed -E 's/.*nowrap">(.*)<\/td><\/tr>/\1/' <<< "$rows")

    printf '%s,%s,%s,%s\n' "$url" "$status" "$category" "$reputation"
}

# Require exactly one argument: the file holding the list of URLs.
# Usage text is a diagnostic, so it goes to stderr and never ends up in the CSV.
if [[ "$#" -ne 1 ]]; then
    {
        echo "Extract status, website category, and reputation from log files created with url_categorization.sh. Results are given as csv (structure: url,status,category,reputation)."
        echo "Usage: $0 <file with list of urls>"
    } >&2
    exit 1
fi

# Input file containing URLs, one per line
input_file="$1"

# Fail early with a clear message instead of a raw redirection error.
# Only readability is checked so that pipes / process substitution still work.
if [[ ! -r "$input_file" || -d "$input_file" ]]; then
    printf 'Error: cannot read input file: %s\n' "$input_file" >&2
    exit 1
fi

# Read the file line by line. The "|| [[ -n ... ]]" part keeps a final line
# that is not terminated by a newline, which plain "read" would drop.
while IFS= read -r line || [[ -n "$line" ]]; do
    # Drop a trailing carriage return so CRLF files do not corrupt the
    # log path or the CSV row.
    line=${line%$'\r'}

    # Lines starting with http:// or https:// are treated as URLs.
    if [[ $line =~ ^https?:// ]]; then
        # Look up the log for this URL and print its CSV row
        make_request "$line"
    else
        # Diagnostic only: keep it out of the CSV written to stdout.
        printf 'Skipping non HTTPS line: %s\n' "$line" >&2
    fi
done < "$input_file"

