From 35d3e321c44f7044f53a95d8cf7615c8bcf3e5e9 Mon Sep 17 00:00:00 2001
From: Matt McCutchen
Date: Wed, 16 Sep 2020 15:44:24 -0400
Subject: [PATCH 1/1] Add web log analysis tools.

---
 web-logs/Makefile       | 139 ++++++++++++++++++++++++++++++++++++++++
 web-logs/group-requests |  50 +++++++++++++++
 2 files changed, 189 insertions(+)
 create mode 100644 web-logs/Makefile
 create mode 100755 web-logs/group-requests

diff --git a/web-logs/Makefile b/web-logs/Makefile
new file mode 100644
index 0000000..563ebd6
--- /dev/null
+++ b/web-logs/Makefile
@@ -0,0 +1,139 @@
+# Matt McCutchen's web log analysis makefile
+# https://mattmccutchen.net/utils/#web-logs
+
+# I do wish for a build tool that rebuilds on command change. Look for one? mgear even? ~ 2020-09-16
+
+# Example of what goes in config.mk :
+# MY_SITE_RE := https://mattmccutchen\.net/
+# MY_IP_RE := '^123\.124\.125\.126 '
+include config.mk
+
+# Abbreviation. Note: A variable reference without parentheses or
+# braces, like $G, always takes a single character as the variable name.
+G := generated
+
+# List the most important targets. I think they pull in most of the rest.
+default: $G/logs-by-ip $G/requests
+
+clean:
+	rm -f $G/*
+
+.DELETE_ON_ERROR:
+
+# Log line format (with one trailing space!):
+# client_ip remote_logname_not_used http_authenticated_user [date] "request" response_status_code response_body_size "referrer" "user_agent"
+#
+# Standard sed- and (grep -E)-compatible regular expression for matching the fields we care about (client_ip, request, referrer, user_agent) while passing through the others:
+# (Unfortunately, sed doesn't seem to support (?:...) or anything like that. I don't think it's worth seeking a better tool right now. ~ 2020-09-16)
+#log_line_regex := ^([^ ]+)( [^"]*)"(([^"\\]|\\.)*)"([^"]*)"(([^"\\]|\\.)*)" "(([^"\\]|\\.)*)" $$
+log_line_regex := ^([^ ]+)( [^"]*)"(([^"\\]|\\.)*)" ([0-9]+) ([^"]*)"(([^"\\]|\\.)*)" "(([^"\\]|\\.)*)" $$
+# " # help emacs syntax highlighting
+#
+# Note, the dollar is doubled for make. Also, this contains no single quotes,
+# which is important because we'll pass it to the shell in single quotes. No
+# commas either because that's my favorite delimiter for s,,, when the strings
+# may contain slashes (typically due to file paths).
+#
+# Reconstituting the whole line: '\1\2"\3" \5 \6"\7" "\9" ' (remove the single quotes but note the trailing space)
+# Interesting fields:
+# \1: IP
+# \3: request (content escaped but without surrounding quotes so you can manipulate prefixes/suffixes)
+# \5: response status code
+# \7: referrer (ditto \3)
+# \9: user agent (ditto \3)
+
+# Take the current and previous day's logs. If I look once a day (DreamHost time),
+# this should catch everything with some overlap. TODO: Avoid the overlap?
+# Requires sth more sophisticated to track what I've already seen.
+#
+# Filename pattern for recent-logs is locked down in pull-logs, so we shouldn't have shell injection here.
recent_logs := $(shell ls -t recent-logs/access.log* | head -n 2 | tac)
+
+$G/logs: $(recent_logs)
+	cat $^ >$@
+	@grep -vE '$(log_line_regex)' $@; if [ $$? != 1 ]; then echo >&2 'Error: Malformed log lines; please address before continuing.'; false; fi
+
+# A crude measure that will take out anyone else accessing the site
+# from the same public IP address as well. :/
+# But I can't think of an easy, better alternative. ~ 2020-09-16
+# This needs to be a separate step from temp-bot-ip-regexps so that uas doesn't include me.
+$G/logs-not-me: $G/logs
+	<$< grep -v $(MY_IP_RE) >$@
+
+# bot-uas:
+# I tried to be conservative and not add generic client libraries unless
+# it was clear they generally displayed bot-like behavior. Such libraries so far:
+# axios, Datanyze.
+
+# Quote a basic regular expression for grep. We only need to escape .[]^$\"
+# because the other metacharacters are behind \ anyway. ~ 2020-09-07
+s_quote_basic_re := s,([.\[\]^$$\\"]),\\\1,g
+# " # help emacs syntax highlighting
+
+$G/bot-ua-regexps: config/bot-uas
+	<$< sed -re '$(s_quote_basic_re); s,^(.*)$$, "\1" $$,' >$@
+
+# Some dishonest clients show clearly bot-like behavior while using a user-agent
+# I don't want to block because it might catch organic traffic. Temporarily add
+# their IP addresses to temp-bot-ips after reviewing any apparently organic
+# traffic from those IP addresses.
+$G/temp-bot-ip-regexps: config/temp-bot-ips
+	<$< sed -re '$(s_quote_basic_re); s,^(.*)$$,^\1 ,' >$@
+
+$G/logs-organic: $G/logs-not-me $G/bot-ua-regexps $G/temp-bot-ip-regexps
+	<$< grep -v -f $G/bot-ua-regexps -f $G/temp-bot-ip-regexps >$@
+
+s_unescape_field := s,\\(.),\1,g
+count_and_rank := { sort | uniq -c | sort -k1,1 -r -n; }
+
+# Note: dynamically expanded (recursive "=", not ":=") so $<, $@, and friends
+# resolve per-rule when we use it in multiple rules.
+# Also note: the "logs" rule guarantees that all lines match.
+uas_command = <$< sed -re 's,$(log_line_regex),\9,; $(s_unescape_field)' | $(count_and_rank) >$@
+
+# In case I'm curious about the distribution of user agents, including bots, at
+# the request level. If I want to measure actual server load from bots, I could
+# come a little closer by looking at response size and whether CGIs are invoked.
+# (Fixed: these recipes previously referenced $(uas-command) with a hyphen,
+# which is an undefined variable in make and silently expands to an empty
+# recipe, producing empty output files.)
+$G/uas: $G/logs-not-me
+	$(uas_command)
+
+# Ditto above but for non-bots or identifying new bots.
+$G/uas-organic: $G/logs-organic
+	$(uas_command)
+
+# We don't care about the order of IPs here.
+$G/logs-by-ip: $G/logs-organic
+	<$< sort -k1,1 >$@
+
+irsr_line_regex := ^([^ ]+) "(([^"\\]|\\.)*)" ([0-9]+) "(([^"\\]|\\.)*)"$$
+# Reconstituting: \1 "\2" \4 "\5"
+# \1: IP
+# \2: request
+# \4: response status code
+# \5: referrer
+
+# Keep only IP, request (without HTTP version), status code, and referrer.
+# Dedup successive identical lines from the same IP (common case: range requests for same streaming video)
+#
+# Status code is here so I can spot problems that are my fault (e.g., 404) and fix them.
+# Hopefully it doesn't break grouping much.
+#
+# Unfortunately, adding the HTTP version splitting to log_line_regex would exceed sed's limit of 9
+# backreferences, so we use a separate substitution. We could consider migrating to another tool
+# (https://stackoverflow.com/questions/4318114/circumvent-the-sed-backreference-limit-1-through-9).
+$G/irsr-deduped: $G/logs-by-ip
+	<$< sed -re 's,$(log_line_regex),\1 "\3" \5 "\7",' \
+	  -e 's,^([^ ]+) "(([^"\\]|\\.)*) ([^"\\]|\\.)*" ([0-9]+) "(([^"\\]|\\.)*)"$$,\1 "\2" \5 "\6",' | uniq >$@
+# " # help emacs syntax highlighting
+
+# Remove a trailing index.html from requests and internal referrers. On
+# 2020-09-16, I got rid of the internal links to index.html, but there may still
+# be a tail of external links (including bookmarks) and search engines that
+# haven't updated. I don't think there's any important case in which a trailing
+# index.html is not equivalent to the directory.
+$G/irsr-index-html-snipped: $G/irsr-deduped
+	<$< sed -r -e 's,^([^ ]+) "(([^"\\]|\\.)*/)index\.html" ([0-9]+) "(([^"\\]|\\.)*)"$$,\1 "\2" \4 "\5",' \
+	  -e 's,^([^ ]+) "(([^"\\]|\\.)*)" ([0-9]+) "($(MY_SITE_RE)(([^"\\]|\\.)*/)?)index\.html"$$,\1 "\2" \4 "\5",' >$@
+
+$G/requests: $G/irsr-index-html-snipped
+	<$< ./group-requests >$@
diff --git a/web-logs/group-requests b/web-logs/group-requests
new file mode 100755
index 0000000..9af5cd1
--- /dev/null
+++ b/web-logs/group-requests
@@ -0,0 +1,50 @@
+#!/usr/bin/env perl
+# group-requests: Helper for Matt McCutchen's web log analysis process
+# https://mattmccutchen.net/utils/#web-logs
+
+# Input on stdin: generated/irsr format, i.e. one line per request:
+#   ip "request" status_code "referrer"
+# Output on stdout: requests grouped by status code, then ranked by count,
+# with the referrers for each request ranked beneath it.
+
+use strict;
+use warnings;
+
+#sub unescape_field($) {
+#	return ($_[0] =~ s,\\(.),$1,g);
+#}
+
+#sub escape_field($) {
+#	return ($_[0] =~ s,([\\"]),\\$1,g);
+#}
+
+my %status_objs = ();
+
+# Read stdin (or files named on the command line) via the diamond operator.
+# (Fixed: the condition had been mangled to the syntactically invalid
+# "while ()" when the angle brackets of <> were stripped in transit.)
+while (<>) {
+	# irsr_line_regex from Makefile
+	m,^([^ ]+) "(([^"\\]|\\.)*)" ([0-9]+) "(([^"\\]|\\.)*)"$, or die "Malformed line";
+	my ($request, $status, $referrer) = ($2, $4, $5);
+	#my $rs = "\"$request\" $response_status_code";
+	# Duplicating the key inside the object makes iteration easier later.
+	my $s_obj = ($status_objs{$status} //= {status => $status, req_objs => {}});
+	my $req_obj = ($s_obj->{req_objs}->{$request} //= {request => $request, count => 0, ref_objs => {}});
+	$req_obj->{count}++;
+	my $ref_obj = ($req_obj->{ref_objs}->{$referrer} //= {referrer => $referrer, count => 0});
+	$ref_obj->{count}++;
+}
+
+# Group by status code to make it easy to separate both my-fault and abuse 404s from the bulk of 200s.
+foreach my $s_obj (sort { $a->{status} <=> $b->{status} } values(%status_objs)) {
+	print "=== STATUS CODE $s_obj->{status} ===\n";
+	# Break ties alphabetically to make it a little easier to read.
+	foreach my $req_obj (sort { $b->{count} <=> $a->{count} || $a->{request} cmp $b->{request} } values(%{$s_obj->{req_objs}})) {
+		my @ref_objs = sort { $b->{count} <=> $a->{count} || $a->{referrer} cmp $b->{referrer} } values(%{$req_obj->{ref_objs}});
+		if (scalar(@ref_objs) == 1) {
+			# Special case to make output a little easier to read.
+			print sprintf("%4d \"%s\" \"%s\"\n", $req_obj->{count}, $req_obj->{request}, $ref_objs[0]->{referrer});
+		} else {
+			# That will be the day when we get more than 9999 identical request lines in less than 2 days...
+			print sprintf("%4d \"%s\"\n", $req_obj->{count}, $req_obj->{request});
+			foreach my $ref_obj (@ref_objs) {
+				print sprintf("  %4d \"%s\"\n", $ref_obj->{count}, $ref_obj->{referrer});
+			}
+		}
+	}
+}
-- 
2.34.1