# Matt McCutchen's web log analysis makefile
# https://mattmccutchen.net/utils/#web-logs

# I do wish for a build tool that rebuilds on command change. Look for one? mgear even? ~ 2020-09-16

# Example of what goes in config.mk :
# MY_SITE_RE := https://mattmccutchen\.net/
# MY_IP_RE := '^123\.124\.125\.126 '
include config.mk

# Abbreviation. Note: A variable reference without parentheses or
# braces, like $G, always takes a single character as the variable name.
G := generated

# "default" and "clean" are commands, not files: declare them phony so a
# stray file with either name can't make them appear up to date.
.PHONY: default clean

# I think these targets pull in the rest. ~ 2020-09-17
default: $G/logs-by-ip $G/requests $G/uas $G/uas-organic

clean:
	rm -f $G/*

# Remove a half-written target when its recipe fails, so a broken file
# never looks newer than its prerequisites on the next run.
.DELETE_ON_ERROR:

# Log line format (with one trailing space!):
# client_ip remote_logname_not_used http_authenticated_user [date] "request" response_status_code response_body_size "referrer" "user_agent"
#
# Standard sed- and (grep -E)-compatible regular expression for matching the fields we care about (client_ip, request, referrer, user_agent) while passing through the others:
# (Unfortunately, sed doesn't seem to support (?:...) or anything like that. I don't think it's worth seeking a better tool right now. ~ 2020-09-16)
#log_line_regex := ^([^ ]+)( [^"]*)"(([^"\\]|\\.)*)"([^"]*)"(([^"\\]|\\.)*)" "(([^"\\]|\\.)*)" $$
log_line_regex := ^([^ ]+)( [^"]*)"(([^"\\]|\\.)*)" ([0-9]+) ([^"]*)"(([^"\\]|\\.)*)" "(([^"\\]|\\.)*)" $$
# " # help emacs syntax highlighting
#
# Note, the dollar is doubled for make. Also, this contains no single quotes,
# which is important because we'll pass it to the shell in single quotes. No
# commas either because that's my favorite delimiter for s,,, when the strings
# may contain slashes (typically due to file paths).
#
# Reconstituting the whole line: '\1\2"\3" \5 \6"\7" "\9" ' (remove the single quotes but note the trailing space)
# Interesting fields:
# \1: IP
# \3: request (content escaped but without surrounding quotes so you can manipulate prefixes/suffixes)
# \5: response status code
# \7: referrer (ditto \3)
# \9: user agent (ditto \3)

# Take the current and previous day's logs. If I look once a day (DreamHost time),
# this should catch everything with some overlap. TODO: Avoid the overlap?
# Requires sth more sophisticated to track what I've already seen.
#
# Filename pattern for recent-logs is locked down in pull-logs, so we shouldn't have shell injection here.
# (tac restores oldest-first order after ls -t's newest-first sort.)
recent_logs := $(shell ls -t recent-logs/access.log* | head -n 2 | tac)

# Concatenate the two most recent logs, then validate: grep -v prints any
# line NOT matching log_line_regex and exits 1 only when it printed
# nothing, so an exit status other than 1 means malformed lines exist
# (or grep itself failed) and we abort the build.
$G/logs: $(recent_logs)
	cat $^ >$@
	@grep -vE '$(log_line_regex)' $@; if [ $$? != 1 ]; then echo >&2 'Error: Malformed log lines; please address before continuing.'; false; fi

# A crude measure that will take out anyone else accessing the site
# from the same public IP address as well. :/
# But I can't think of an easy, better alternative. ~ 2020-09-16
# This needs to be a separate step from temp-bot-ip-regexps so that uas doesn't include me.
# (MY_IP_RE comes from config.mk and is expected to carry its own shell quoting;
# see the example at the top of this file.)
$G/logs-not-me: $G/logs
	<$< grep -v $(MY_IP_RE) >$@

# bot-uas:
# I tried to be conservative and not add generic client libraries unless
# it was clear they generally displayed bot-like behavior. Such libraries so far:
# axios, Datanyze.

# Quote a basic regular expression for grep. We only need to escape .[]^$\"
# because the other metacharacters are behind \ anyway. ~ 2020-09-07
s_quote_basic_re := s,([.\[\]^$$\\"]),\\\1,g
# " # help emacs syntax highlighting

# Turn each literal user-agent string in config/bot-uas into a regexp
# anchored to the quoted user-agent field at the end of a log line
# (note the trailing space before $ matches the log format's trailing space).
$G/bot-ua-regexps: config/bot-uas
	<$< sed -re '$(s_quote_basic_re); s,^(.*)$$, "\1" $$,' >$@

# Some dishonest clients show clearly bot-like behavior while using a user-agent
# I don't want to block because it might catch organic traffic.
# Temporarily add their IP addresses to temp-bot-ips after reviewing any
# apparently organic traffic from those IP addresses.
# Each literal IP becomes a regexp anchored to the start-of-line IP field.
$G/temp-bot-ip-regexps: config/temp-bot-ips
	<$< sed -re '$(s_quote_basic_re); s,^(.*)$$,^\1 ,' >$@

# Drop lines matching either a known bot user-agent or a temp-bot IP.
$G/logs-organic: $G/logs-not-me $G/bot-ua-regexps $G/temp-bot-ip-regexps
	<$< grep -v -f $G/bot-ua-regexps -f $G/temp-bot-ip-regexps >$@

# Undo the backslash escaping inside an extracted quoted log field.
s_unescape_field := s,\\(.),\1,g

# Count identical lines and list them most-frequent first
# (uniq -c puts the count in field 1; the second sort ranks by it).
count_and_rank := { sort | uniq -c | sort -k1,1 -r -n; }

# Extract field \9 (user agent) from each line and rank by frequency.
# Note: dynamically expanded so we can use it in multiple rules.
# Also note: the "logs" rule guarantees that all lines match.
uas_command = <$< sed -re 's,$(log_line_regex),\9,; $(s_unescape_field)' | $(count_and_rank) >$@

# In case I'm curious about the distribution of user agents, including bots, at
# the request level. If I want to measure actual server load from bots, I could
# come a little closer by looking at response size and whether CGIs are invoked.
$G/uas: $G/logs-not-me
	$(uas_command)

# Ditto above but for non-bots or identifying new bots.
$G/uas-organic: $G/logs-organic
	$(uas_command)

# We don't care about the order of IPs here.
$G/logs-by-ip: $G/logs-organic
	<$< sort -k1,1 >$@

# Line format produced by the irsr-deduped rule below.
irsr_line_regex := ^([^ ]+) "(([^"\\]|\\.)*)" ([0-9]+) "(([^"\\]|\\.)*)"$$
# Reconstituting: \1 "\2" \4 "\5"
# \1: IP
# \2: request
# \4: response status code
# \5: referrer

# Keep only IP, request (without HTTP version), status code, and referrer.
# Dedup successive identical lines from the same IP (common case: range requests for same streaming video)
#
# Status code is here so I can spot problems that are my fault (e.g., 404) and fix them.
# Hopefully it doesn't break grouping much.
#
# Unfortunately, adding the HTTP version splitting to log_line_regex would exceed sed's limit of 9
# backreferences, so we use a separate substitution. We could consider migrating to another tool
# (https://stackoverflow.com/questions/4318114/circumvent-the-sed-backreference-limit-1-through-9).
# First sed expression reduces a full log line to: IP "request" status "referrer".
# Second expression strips the trailing HTTP-version word from the request
# field (done separately to stay within sed's 9-backreference limit, per the
# note above). uniq then collapses successive identical lines.
$G/irsr-deduped: $G/logs-by-ip
	<$< sed -re 's,$(log_line_regex),\1 "\3" \5 "\7",' \
	        -e 's,^([^ ]+) "(([^"\\]|\\.)*) ([^"\\]|\\.)*" ([0-9]+) "(([^"\\]|\\.)*)"$$,\1 "\2" \5 "\6",' | uniq >$@
# " # help emacs syntax highlighting

# Remove a trailing index.html from requests and internal referrers. On
# 2020-09-16, I got rid of the internal links to index.html, but there may still
# be a tail of external links (including bookmarks) and search engines that
# haven't updated. I don't think there's any important case in which a trailing
# index.html is not equivalent to the directory.
# (First expression handles the request field; second handles referrers that
# start with MY_SITE_RE, i.e., internal referrers only.)
$G/irsr-index-html-snipped: $G/irsr-deduped
	<$< sed -r -e 's,^([^ ]+) "(([^"\\]|\\.)*/)index\.html" ([0-9]+) "(([^"\\]|\\.)*)"$$,\1 "\2" \4 "\5",' \
	           -e 's,^([^ ]+) "(([^"\\]|\\.)*)" ([0-9]+) "($(MY_SITE_RE)(([^"\\]|\\.)*/)?)index\.html"$$,\1 "\2" \4 "\5",' >$@

# Final grouping is delegated to the project-local group-requests script.
$G/requests: $G/irsr-index-html-snipped
	<$< ./group-requests >$@