From 35d3e321c44f7044f53a95d8cf7615c8bcf3e5e9 Mon Sep 17 00:00:00 2001
From: Matt McCutchen
Date: Wed, 16 Sep 2020 15:44:24 -0400
Subject: [PATCH 1/1] Add web log analysis tools.

---
 web-logs/Makefile       | 139 ++++++++++++++++++++++++++++++++++++++++
 web-logs/group-requests |  50 +++++++++++++++
 2 files changed, 189 insertions(+)
 create mode 100644 web-logs/Makefile
 create mode 100755 web-logs/group-requests

diff --git a/web-logs/Makefile b/web-logs/Makefile
new file mode 100644
index 0000000..563ebd6
--- /dev/null
+++ b/web-logs/Makefile
@@ -0,0 +1,139 @@
+# Matt McCutchen's web log analysis makefile
+# https://mattmccutchen.net/utils/#web-logs
+
+# I do wish for a build tool that rebuilds on command change. Look for one? mgear even? ~ 2020-09-16
+
+# Example of what goes in config.mk :
+# MY_SITE_RE := https://mattmccutchen\.net/
+# MY_IP_RE := '^123\.124\.125\.126 '
+include config.mk
+
+# Abbreviation. Note: A variable reference without parentheses or
+# braces, like $G, always takes a single character as the variable name.
+G := generated
+
+# List the most important targets. I think they pull in most of the rest.
+default: $G/logs-by-ip $G/requests
+
+clean:
+	rm -f $G/*
+
+.DELETE_ON_ERROR:
+
+# Log line format (with one trailing space!):
+# client_ip remote_logname_not_used http_authenticated_user [date] "request" response_status_code response_body_size "referrer" "user_agent"
+#
+# Standard sed- and (grep -E)-compatible regular expression for matching the fields we care about (client_ip, request, referrer, user_agent) while passing through the others:
+# (Unfortunately, sed doesn't seem to support (?:...) or anything like that. I don't think it's worth seeking a better tool right now. ~ 2020-09-16)
+#log_line_regex := ^([^ ]+)( [^"]*)"(([^"\\]|\\.)*)"([^"]*)"(([^"\\]|\\.)*)" "(([^"\\]|\\.)*)" $$
+log_line_regex := ^([^ ]+)( [^"]*)"(([^"\\]|\\.)*)" ([0-9]+) ([^"]*)"(([^"\\]|\\.)*)" "(([^"\\]|\\.)*)" $$
+# " # help emacs syntax highlighting
+#
+# Note, the dollar is doubled for make. Also, this contains no single quotes,
+# which is important because we'll pass it to the shell in single quotes. No
+# commas either because that's my favorite delimiter for s,,, when the strings
+# may contain slashes (typically due to file paths).
+#
+# Reconstituting the whole line: '\1\2"\3" \5 \6"\7" "\9" ' (remove the single quotes but note the trailing space)
+# Interesting fields:
+# \1: IP
+# \3: request (content escaped but without surrounding quotes so you can manipulate prefixes/suffixes)
+# \5: response status code
+# \7: referrer (ditto \3)
+# \9: user agent (ditto \3)
+
+# Take the current and previous day's logs. If I look once a day (DreamHost time),
+# this should catch everything with some overlap. TODO: Avoid the overlap?
+# Requires sth more sophisticated to track what I've already seen.
+#
+# Filename pattern for recent-logs is locked down in pull-logs, so we shouldn't have shell injection here.
recent_logs := $(shell ls -t recent-logs/access.log* | head -n 2 | tac)
+
+$G/logs: $(recent_logs)
+	cat $^ >$@
+	@grep -vE '$(log_line_regex)' $@; if [ $$? != 1 ]; then echo >&2 'Error: Malformed log lines; please address before continuing.'; false; fi
+
+# A crude measure that will take out anyone else accessing the site
+# from the same public IP address as well. :/
+# But I can't think of an easy, better alternative. ~ 2020-09-16
+# This needs to be a separate step from temp-bot-ip-regexps so that uas doesn't include me.
+$G/logs-not-me: $G/logs
+	<$< grep -v $(MY_IP_RE) >$@
+
+# bot-uas:
+# I tried to be conservative and not add generic client libraries unless
+# it was clear they generally displayed bot-like behavior. Such libraries so far:
+# axios, Datanyze.
+
+# Quote a basic regular expression for grep. We only need to escape .[]^$\"
+# because the other metacharacters are behind \ anyway. ~ 2020-09-07
+s_quote_basic_re := s,([.\[\]^$$\\"]),\\\1,g
+# " # help emacs syntax highlighting
+
+$G/bot-ua-regexps: config/bot-uas
+	<$< sed -re '$(s_quote_basic_re); s,^(.*)$$, "\1" $$,' >$@
+
+# Some dishonest clients show clearly bot-like behavior while using a user-agent
+# I don't want to block because it might catch organic traffic. Temporarily add
+# their IP addresses to temp-bot-ips after reviewing any apparently organic
+# traffic from those IP addresses.
+$G/temp-bot-ip-regexps: config/temp-bot-ips
+	<$< sed -re '$(s_quote_basic_re); s,^(.*)$$,^\1 ,' >$@
+
+$G/logs-organic: $G/logs-not-me $G/bot-ua-regexps $G/temp-bot-ip-regexps
+	<$< grep -v -f $G/bot-ua-regexps -f $G/temp-bot-ip-regexps >$@
+
+s_unescape_field := s,\\(.),\1,g
+count_and_rank := { sort | uniq -c | sort -k1,1 -r -n; }
+
+# Note: dynamically expanded (recursive "=", not ":=") so $<, $@, and friends
+# resolve per-rule when we use it in multiple rules.
+# Also note: the "logs" rule guarantees that all lines match.
+uas_command = <$< sed -re 's,$(log_line_regex),\9,; $(s_unescape_field)' | $(count_and_rank) >$@
+
+# In case I'm curious about the distribution of user agents, including bots, at
+# the request level. If I want to measure actual server load from bots, I could
+# come a little closer by looking at response size and whether CGIs are invoked.
+# (Fixed: these recipes previously referenced $(uas-command) with a hyphen,
+# which is an undefined variable in make and silently expands to an empty
+# recipe, producing empty output files.)
+$G/uas: $G/logs-not-me
+	$(uas_command)
+
+# Ditto above but for non-bots or identifying new bots.
+$G/uas-organic: $G/logs-organic
+	$(uas_command)
+
+# We don't care about the order of IPs here.
+$G/logs-by-ip: $G/logs-organic
+	<$< sort -k1,1 >$@
+
+irsr_line_regex := ^([^ ]+) "(([^"\\]|\\.)*)" ([0-9]+) "(([^"\\]|\\.)*)"$$
+# Reconstituting: \1 "\2" \4 "\5"
+# \1: IP
+# \2: request
+# \4: response status code
+# \5: referrer
+
+# Keep only IP, request (without HTTP version), status code, and referrer.
+# Dedup successive identical lines from the same IP (common case: range requests for same streaming video)
+#
+# Status code is here so I can spot problems that are my fault (e.g., 404) and fix them.
+# Hopefully it doesn't break grouping much.
+#
+# Unfortunately, adding the HTTP version splitting to log_line_regex would exceed sed's limit of 9
+# backreferences, so we use a separate substitution. We could consider migrating to another tool
+# (https://stackoverflow.com/questions/4318114/circumvent-the-sed-backreference-limit-1-through-9).
+$G/irsr-deduped: $G/logs-by-ip
+	<$< sed -re 's,$(log_line_regex),\1 "\3" \5 "\7",' \
+	  -e 's,^([^ ]+) "(([^"\\]|\\.)*) ([^"\\]|\\.)*" ([0-9]+) "(([^"\\]|\\.)*)"$$,\1 "\2" \5 "\6",' | uniq >$@
+# " # help emacs syntax highlighting
+
+# Remove a trailing index.html from requests and internal referrers. On
+# 2020-09-16, I got rid of the internal links to index.html, but there may still
+# be a tail of external links (including bookmarks) and search engines that
+# haven't updated. I don't think there's any important case in which a trailing
+# index.html is not equivalent to the directory.
+$G/irsr-index-html-snipped: $G/irsr-deduped
+	<$< sed -r -e 's,^([^ ]+) "(([^"\\]|\\.)*/)index\.html" ([0-9]+) "(([^"\\]|\\.)*)"$$,\1 "\2" \4 "\5",' \
+	  -e 's,^([^ ]+) "(([^"\\]|\\.)*)" ([0-9]+) "($(MY_SITE_RE)(([^"\\]|\\.)*/)?)index\.html"$$,\1 "\2" \4 "\5",' >$@
+
+$G/requests: $G/irsr-index-html-snipped
+	<$< ./group-requests >$@
diff --git a/web-logs/group-requests b/web-logs/group-requests
new file mode 100755
index 0000000..9af5cd1
--- /dev/null
+++ b/web-logs/group-requests
@@ -0,0 +1,50 @@
+#!/usr/bin/env perl
+# group-requests: Helper for Matt McCutchen's web log analysis process
+# https://mattmccutchen.net/utils/#web-logs
+
+# Input on stdin: generated/irsr format, i.e. one line per request:
+#   ip "request" status_code "referrer"
+# Output on stdout: requests grouped by status code, then ranked by count,
+# with the referrers for each request ranked beneath it.
+
+use strict;
+use warnings;
+
+#sub unescape_field($) {
+#	return ($_[0] =~ s,\\(.),$1,g);
+#}
+
+#sub escape_field($) {
+#	return ($_[0] =~ s,([\\"]),\\$1,g);
+#}
+
+my %status_objs = ();
+
+# Read stdin (or files named on the command line) via the diamond operator.
+# (Fixed: the condition had been mangled to the syntactically invalid
+# "while ()" when the angle brackets of <> were stripped in transit.)
+while (<>) {
+	# irsr_line_regex from Makefile
+	m,^([^ ]+) "(([^"\\]|\\.)*)" ([0-9]+) "(([^"\\]|\\.)*)"$, or die "Malformed line";
+	my ($request, $status, $referrer) = ($2, $4, $5);
+	#my $rs = "\"$request\" $response_status_code";
+	# Duplicating the key inside the object makes iteration easier later.
+	my $s_obj = ($status_objs{$status} //= {status => $status, req_objs => {}});
+	my $req_obj = ($s_obj->{req_objs}->{$request} //= {request => $request, count => 0, ref_objs => {}});
+	$req_obj->{count}++;
+	my $ref_obj = ($req_obj->{ref_objs}->{$referrer} //= {referrer => $referrer, count => 0});
+	$ref_obj->{count}++;
+}
+
+# Group by status code to make it easy to separate both my-fault and abuse 404s from the bulk of 200s.
+foreach my $s_obj (sort { $a->{status} <=> $b->{status} } values(%status_objs)) {
+	print "=== STATUS CODE $s_obj->{status} ===\n";
+	# Break ties alphabetically to make it a little easier to read.
+	foreach my $req_obj (sort { $b->{count} <=> $a->{count} || $a->{request} cmp $b->{request} } values(%{$s_obj->{req_objs}})) {
+		my @ref_objs = sort { $b->{count} <=> $a->{count} || $a->{referrer} cmp $b->{referrer} } values(%{$req_obj->{ref_objs}});
+		if (scalar(@ref_objs) == 1) {
+			# Special case to make output a little easier to read.
+			print sprintf("%4d \"%s\" \"%s\"\n", $req_obj->{count}, $req_obj->{request}, $ref_objs[0]->{referrer});
+		} else {
+			# That will be the day when we get more than 9999 identical request lines in less than 2 days...
+			print sprintf("%4d \"%s\"\n", $req_obj->{count}, $req_obj->{request});
+			foreach my $ref_obj (@ref_objs) {
+				print sprintf("  %4d \"%s\"\n", $ref_obj->{count}, $ref_obj->{referrer});
+			}
+		}
+	}
+}
-- 
2.34.1