web-logs/Makefile

   1 # Matt McCutchen's web log analysis makefile
   2 # https://mattmccutchen.net/utils/#web-logs
   3
   4 # I do wish for a build tool that rebuilds on command change.  Look for one?  mgear even? ~ 2020-09-16
   5
   6 # Example of what goes in config.mk :
   7 # MY_SITE_RE := https://mattmccutchen\.net/
   8 # MY_IP_RE := '^123\.124\.125\.126 '
   9 include config.mk
  10
  11 # Abbreviation.  Note: A variable reference without parentheses or
  12 # braces, like $G, always takes a single character as the variable name.
  13 G := generated
  14
  15 # I think these targets pull in the rest. ~ 2020-09-17
  16 default: $G/logs-by-ip $G/requests $G/uas $G/uas-organic
  17
  18 clean:
  19         rm -f $G/*
  20
  21 .DELETE_ON_ERROR:
  22
  23 # Log line format (with one trailing space!):
  24 # client_ip remote_logname_not_used http_authenticated_user [date] "request" response_status_code response_body_size "referrer" "user_agent"
  25 #
  26 # Standard sed- and (grep -E)-compatible regular expression for matching the fields we care about (client_ip, request, referrer, user_agent) while passing through the others:
  27 # (Unfortunately, sed doesn't seem to support (?:...) or anything like that.  I don't think it's worth seeking a better tool right now. ~ 2020-09-16)
  28
  29 #log_line_regex := ^([^ ]+)( [^"]*)"(([^"\\]|\\.)*)"([^"]*)"(([^"\\]|\\.)*)" "(([^"\\]|\\.)*)" $$
  30 log_line_regex := ^([^ ]+)( [^"]*)"(([^"\\]|\\.)*)" ([0-9]+) ([^"]*)"(([^"\\]|\\.)*)" "(([^"\\]|\\.)*)" $$
  31 # "  # help emacs syntax highlighting
  32 #
  33 # Note, the dollar is doubled for make.  Also, this contains no single quotes,
  34 # which is important because we'll pass it to the shell in single quotes.  No
  35 # commas either because that's my favorite delimiter for s,,, when the strings
  36 # may contain slashes (typically due to file paths).
  37 #
  38 # Reconstituting the whole line: '\1\2"\3" \5 \6"\7" "\9" ' (remove the single quotes but note the trailing space)
  39 # Interesting fields:
  40 # \1: IP
  41 # \3: request (content escaped but without surrounding quotes so you can manipulate prefixes/suffixes)
  42 # \5: response status code
  43 # \7: referrer (ditto \3)
  44 # \9: user agent (ditto \3)
  45
  46 # Take the previous day's logs.  If I look once a day (DreamHost time), this
  47 # should catch everything with no overlap.  I previously included the current
  48 # day's logs, but I now think the difficulty of maintaining an accurate sense of
  49 # usage in the presence of overlap is a greater evil than up to a day of extra
  50 # latency.  I might find a better solution in the future.
  51 #
  52 # Filename pattern for recent-logs is locked down in pull-logs, so we shouldn't have shell injection here.
  53 recent_logs := $(shell ls -t recent-logs/access.log* | head -n 2 | tail -n 1 | tac)
  54
  55 $G/logs: $(recent_logs)
  56         cat $^ >$@
  57         @grep -vE '$(log_line_regex)' $@; if [ $$? != 1 ]; then echo >&2 'Error: Malformed log lines; please address before continuing.'; false; fi
  58
  59 # A crude measure that will take out anyone else accessing the site
  60 # from the same public IP address as well. :/
  61 # But I can't think of an easy, better alternative. ~ 2020-09-16
  62 # This needs to be a separate step from temp-bot-ip-regexps so that uas doesn't include me.
  63 $G/logs-not-me: $G/logs
  64         <$< grep -v $(MY_IP_RE) >$@
  65
  66 # bot-uas:
  67 # I tried to be conservative and not add generic client libraries unless
  68 # it was clear they generally displayed bot-like behavior.  Such libraries so far:
  69 # axios, Datanyze.
  70
  71 # Quote a basic regular expression for grep.  We only need to escape .[]^$\"
  72 # because the other metacharacters are behind \ anyway. ~ 2020-09-07
  73 s_quote_basic_re := s,([.\[\]^$$\\"]),\\\1,g
  74 # "  # help emacs syntax highlighting
  75
  76 $G/bot-ua-regexps: config/bot-uas
  77         <$< sed -re '$(s_quote_basic_re); s,^(.*)$$, "\1" $$,' >$@
  78
  79 # Some dishonest clients show clearly bot-like behavior while using a user-agent
  80 # I don't want to block because it might catch organic traffic.  Temporarily add
  81 # their IP addresses to temp-bot-ips after reviewing any apparently organic
  82 # traffic from those IP addresses.
  83 $G/temp-bot-ip-regexps: config/temp-bot-ips
  84         <$< sed -re '$(s_quote_basic_re); s,^(.*)$$,^\1 ,' >$@
  85
  86 $G/logs-organic: $G/logs-not-me $G/bot-ua-regexps $G/temp-bot-ip-regexps
  87         <$< grep -v -f $G/bot-ua-regexps -f $G/temp-bot-ip-regexps >$@
  88
  89 s_unescape_field := s,\\(.),\1,g
  90 count_and_rank := { sort | uniq -c | sort -k1,1 -r -n; }
  91
  92 # Note: dynamically expanded so we can use it in multiple rules.
  93 # Also note: the "logs" rule guarantees that all lines match.
  94 uas_command = <$< sed -re 's,$(log_line_regex),\9,; $(s_unescape_field)' | $(count_and_rank) >$@
  95
  96 # In case I'm curious about the distribution of user agents, including bots, at
  97 # the request level.  If I want to measure actual server load from bots, I could
  98 # come a little closer by looking at response size and whether CGIs are invoked.
  99 $G/uas: $G/logs-not-me
 100         $(uas_command)
 101
 102 # Ditto above but for non-bots or identifying new bots.
 103 $G/uas-organic: $G/logs-organic
 104         $(uas_command)
 105
 106 # We don't care about the order of IPs here.
 107 $G/logs-by-ip: $G/logs-organic
 108         <$< sort -k1,1 >$@
 109
 110 irsr_line_regex := ^([^ ]+) "(([^"\\]|\\.)*)" ([0-9]+) "(([^"\\]|\\.)*)"$$
 111 # Reconstituting: \1 "\2" "\4"
 112 # \1: IP
 113 # \2: request
 114 # \4: response status code
 115 # \5: referrer
 116
 117 # Keep only IP, request (without HTTP version), status code, and referrer.
 118 # Dedup successive identical lines from the same IP (common case: range requests for same streaming video)
 119 #
 120 # Status code is here so I can spot problems that are my fault (e.g., 404) and fix them.
 121 # Hopefully it doesn't break grouping much.
 122 #
 123 # Unfortunately, adding the HTTP version splitting to log_line_regex would exceed sed's limit of 9
 124 # backreferences, so we use a separate substitution.  We could consider migrating to another tool
 125 # (https://stackoverflow.com/questions/4318114/circumvent-the-sed-backreference-limit-1-through-9).
 126 $G/irsr-deduped: $G/logs-by-ip
 127         <$< sed -re 's,$(log_line_regex),\1 "\3" \5 "\7",' \
 128             -e 's,^([^ ]+) "(([^"\\]|\\.)*) ([^"\\]|\\.)*" ([0-9]+) "(([^"\\]|\\.)*)"$$,\1 "\2" \5 "\6",' | uniq >$@
 129 # "  # help emacs syntax highlighting
 130
 131 # Remove a trailing index.html from requests and internal referrers.  On
 132 # 2020-09-16, I got rid of the internal links to index.html, but there may still
 133 # be a tail of external links (including bookmarks) and search engines that
 134 # haven't updated.  I don't think there's any important case in which a trailing
 135 # index.html is not equivalent to the directory.
 136 $G/irsr-index-html-snipped: $G/irsr-deduped
 137         <$< sed -r -e 's,^([^ ]+) "(([^"\\]|\\.)*/)index\.html" ([0-9]+) "(([^"\\]|\\.)*)"$$,\1 "\2" \4 "\5",' \
 138             -e 's,^([^ ]+) "(([^"\\]|\\.)*)" ([0-9]+) "($(MY_SITE_RE)(([^"\\]|\\.)*/)?)index\.html"$$,\1 "\2" \4 "\5",' >$@
 139
 140 $G/requests: $G/irsr-index-html-snipped
 141         <$< ./group-requests >$@