| 1 | # Matt McCutchen's web log analysis makefile |
| 2 | # https://mattmccutchen.net/utils/#web-logs |
| 3 | |
| 4 | # I do wish for a build tool that rebuilds on command change. Look for one? mgear even? ~ 2020-09-16 |
| 5 | |
# Site-specific settings come from config.mk, which is expected to define:
#   MY_SITE_RE  Extended regular expression for the site's base URL.  It is
#               spliced into a single-quoted sed s,,, command in the
#               irsr-index-html-snipped rule, so it must not contain single
#               quotes or commas.
#   MY_IP_RE    grep pattern selecting log lines from my own IP address.  The
#               logs-not-me rule passes it to grep unquoted, so the value has
#               to carry its own shell quoting (see the example).
# Example of what goes in config.mk :
# MY_SITE_RE := https://mattmccutchen\.net/
# MY_IP_RE := '^123\.124\.125\.126 '
include config.mk

# Abbreviation for the output directory.  Note: A variable reference without
# parentheses or braces, like $G, always takes a single character as the
# variable name.
G := generated
| 14 | |
# List the most important targets.  I think they pull in most of the rest.
default: $G/logs-by-ip $G/requests

# Remove every (non-hidden) file in the output directory.
clean:
	rm -f $G/*

# default and clean name commands, not files.  Declaring them phony keeps a
# stray file called "default" or "clean" from making them look up to date.
.PHONY: default clean

# If a recipe fails, delete its target so that a partially written file is
# never mistaken for an up-to-date one on the next run.
.DELETE_ON_ERROR:
| 22 | |
# Log line format (with one trailing space!):
# client_ip remote_logname_not_used http_authenticated_user [date] "request" response_status_code response_body_size "referrer" "user_agent"
#
# Standard sed- and (grep -E)-compatible regular expression for matching the fields we care about (client_ip, request, referrer, user_agent) while passing through the others:
# (Unfortunately, sed doesn't seem to support (?:...) or anything like that.  I don't think it's worth seeking a better tool right now. ~ 2020-09-16)

# Variant without a separate capture group for the response status code (kept for reference):
#log_line_regex := ^([^ ]+)( [^"]*)"(([^"\\]|\\.)*)"([^"]*)"(([^"\\]|\\.)*)" "(([^"\\]|\\.)*)" $$
log_line_regex := ^([^ ]+)( [^"]*)"(([^"\\]|\\.)*)" ([0-9]+) ([^"]*)"(([^"\\]|\\.)*)" "(([^"\\]|\\.)*)" $$
# " # help emacs syntax highlighting
#
# Note, the dollar is doubled for make.  Also, this contains no single quotes,
# which is important because we'll pass it to the shell in single quotes.  No
# commas either because that's my favorite delimiter for s,,, when the strings
# may contain slashes (typically due to file paths).
#
# Reconstituting the whole line: '\1\2"\3" \5 \6"\7" "\9" ' (remove the single quotes but note the trailing space)
# Interesting fields:
# \1: IP
# \3: request (content escaped but without surrounding quotes so you can manipulate prefixes/suffixes)
# \5: response status code
# \7: referrer (ditto \3)
# \9: user agent (ditto \3)
# Pass-through and helper groups:
# \2: everything between the IP and the opening quote of the request (with its surrounding spaces)
# \6: everything between the status code and the opening quote of the referrer (response body size plus a space)
# \4, \8: inner repetition groups of the request and referrer; they exist only
#         because sed has no non-capturing groups and carry nothing useful.
#         (The user agent's inner group would be \10, beyond sed's \1-\9.)
| 45 | |
# Take the current and previous day's logs.  If I look once a day (DreamHost time),
# this should catch everything with some overlap.  TODO: Avoid the overlap?
# Requires sth more sophisticated to track what I've already seen.
#
# ls -t lists newest first; tac puts the older of the two files first so the
# concatenation below is in chronological order.
#
# Filename pattern for recent-logs is locked down in pull-logs, so we shouldn't have shell injection here.
recent_logs := $(shell ls -t recent-logs/access.log* | head -n 2 | tac)

# Concatenate the selected logs and verify that every line matches
# log_line_regex; the sed-based rules further down rely on that guarantee.
#
# Recipe steps:
#  1. Fail early if no log file was found.  Otherwise $^ is empty and cat
#     would sit waiting on standard input.
#  2. Build the combined log.
#  3. Print any line that does NOT match the regex.  grep exits with 1 when it
#     selected nothing, which is the only acceptable outcome here; 0 (malformed
#     lines were printed) and 2 (grep itself failed) both abort the build.
$G/logs: $(recent_logs)
	@if [ -z '$^' ]; then echo >&2 'Error: No recent-logs/access.log* files found.'; false; fi
	cat $^ >$@
	@grep -vE '$(log_line_regex)' $@; if [ $$? != 1 ]; then echo >&2 'Error: Malformed log lines; please address before continuing.'; false; fi
| 56 | |
# Drop every request made from my own IP address (MY_IP_RE from config.mk).
# This is crude: anyone else sharing that public IP address is dropped too. :/
# But I can't think of an easy, better alternative. ~ 2020-09-16
# It has to stay a separate step from temp-bot-ip-regexps so that the uas
# listing, which is built from this file, doesn't include me.
$G/logs-not-me: $G/logs
	grep -v $(MY_IP_RE) <$< >$@
| 63 | |
| 64 | # bot-uas: |
| 65 | # I tried to be conservative and not add generic client libraries unless |
| 66 | # it was clear they generally displayed bot-like behavior. Such libraries so far: |
| 67 | # axios, Datanyze. |
| 68 | |
# sed command that quotes a string so grep treats it literally inside a basic
# regular expression.  The characters escaped are:  ] . [ ^ $ * \
# The remaining metacharacters are only special behind a backslash, so
# escaping the backslash itself disarms them.  ~ 2020-09-07
#
# Why the bracket expression is spelled this way:
#  - Inside a POSIX bracket expression a backslash is an ordinary character,
#    so a backslash followed by ] does not give a literal ] -- the list simply
#    ends at that ].  To be a member, ] has to be the first item of the list.
#  - The dot is placed before [ so that the pair "[." cannot be read as the
#    start of a collating symbol.
#  - ^ is not first, so it is a plain member rather than a negation.
#  - The backslash appears twice; the duplicate is harmless and keeps the
#    expression correct even for a tool that treats backslash as an escape
#    inside brackets.
#  - The double quote is deliberately not escaped: it is not a metacharacter,
#    and a backslash before it is undefined in a basic regular expression.
# The dollar is doubled for make.
s_quote_basic_re := s,([].[^$$*\\]),\\\1,g
| 73 | |
# Turn each line of config/bot-uas into a grep pattern: quote it with
# s_quote_basic_re, then wrap it as  "<ua>" $  (leading space, closing quote,
# trailing space, end of line) so that it only matches as the complete final
# quoted field of a log line.
$G/bot-ua-regexps: config/bot-uas
	sed -r -e '$(s_quote_basic_re)' -e 's,^(.*)$$, "\1" $$,' <$< >$@
| 76 | |
# Some dishonest clients show clearly bot-like behavior while using a
# user-agent I don't want to block because it might catch organic traffic.
# Their IP addresses go into config/temp-bot-ips temporarily, after reviewing
# any apparently organic traffic from those addresses.
#
# Each listed address is quoted with s_quote_basic_re and turned into the
# pattern  ^<ip><space>  so that it only matches the client_ip field at the
# start of a log line.
$G/temp-bot-ip-regexps: config/temp-bot-ips
	sed -r -e '$(s_quote_basic_re)' -e 's,^(.*)$$,^\1 ,' <$< >$@
| 83 | |
# Organic traffic: the not-me log minus every line matching one of the bot
# user-agent patterns or one of the temporary bot IP patterns.  The two
# pattern files are the second and third prerequisites.
$G/logs-organic: $G/logs-not-me $G/bot-ua-regexps $G/temp-bot-ip-regexps
	grep -v -f $(word 2,$^) -f $(word 3,$^) <$< >$@
| 86 | |
# sed command that undoes the backslash escaping inside a log field
# (a backslash followed by any character becomes just that character).
s_unescape_field := s,\\(.),\1,g
# Shell pipeline stage: count identical lines and list the most frequent first.
count_and_rank := { sort | uniq -c | sort -k1,1 -r -n; }

# Recipe body shared by the two rules below: reduce every log line of the
# first prerequisite to its unescaped user agent (\9 of log_line_regex), then
# count and rank the distinct values into the target.
#
# Note: dynamically expanded (= rather than :=) so that $< and $@ are resolved
# separately in each rule that uses it.
# Also note: the "logs" rule guarantees that all lines match.
#
# The name is spelled with an underscore; a reference with a different
# spelling is an undefined variable, which make expands to nothing and which
# would leave the recipe empty.
uas_command = <$< sed -re 's,$(log_line_regex),\9,; $(s_unescape_field)' | $(count_and_rank) >$@

# In case I'm curious about the distribution of user agents, including bots, at
# the request level.  If I want to measure actual server load from bots, I could
# come a little closer by looking at response size and whether CGIs are invoked.
$G/uas: $G/logs-not-me
	$(uas_command)

# Ditto above but for non-bots or identifying new bots.
$G/uas-organic: $G/logs-organic
	$(uas_command)
| 103 | |
# Bring all lines of the same client IP together by sorting on the first
# field.  The order in which the IPs themselves come out doesn't matter.
# NOTE(review): without -s, sort orders lines that share an IP by comparing
# the whole line rather than keeping them in input order -- confirm that this
# is what the dedup and grouping steps downstream expect.
$G/logs-by-ip: $G/logs-organic
	sort -k1,1 <$< >$@
| 107 | |
# Regular expression for a line in the reduced format written by the
# irsr-deduped rule:  IP "request" status "referrer"  (no trailing space).
# The dollar is doubled for make.  Not referenced by any rule visible in this
# part of the file; the rules below spell out their own variants.
irsr_line_regex := ^([^ ]+) "(([^"\\]|\\.)*)" ([0-9]+) "(([^"\\]|\\.)*)"$$
# Reconstituting: \1 "\2" \4 "\5"
# \1: IP
# \2: request
# \4: response status code
# \5: referrer
# (\3 and \6 are the inner repetition groups of the request and referrer and
# carry nothing useful.)
| 114 | |
# Keep only IP, request (without HTTP version), status code, and referrer.
# Dedup successive identical lines from the same IP (common case: range requests for same streaming video)
#
# Status code is here so I can spot problems that are my fault (e.g., 404) and fix them.
# Hopefully it doesn't break grouping much.
#
# Unfortunately, adding the HTTP version splitting to log_line_regex would exceed sed's limit of 9
# backreferences, so we use a separate substitution.  We could consider migrating to another tool
# (https://stackoverflow.com/questions/4318114/circumvent-the-sed-backreference-limit-1-through-9).
#
# The two sed expressions:
#  1. Rewrite each full log line as  IP "request" status "referrer"  using
#     groups \1, \3, \5 and \7 of log_line_regex.
#  2. Drop the last space-separated word of the request (the HTTP version).
#     Groups here: \1 IP, \2 request up to (not including) its last space,
#     \5 status code, \6 referrer; \3, \4 and \7 are inner repetition groups.
#     A request containing no space does not match and is left unchanged.
# uniq then collapses runs of adjacent identical lines.
$G/irsr-deduped: $G/logs-by-ip
	<$< sed -re 's,$(log_line_regex),\1 "\3" \5 "\7",' \
	    -e 's,^([^ ]+) "(([^"\\]|\\.)*) ([^"\\]|\\.)*" ([0-9]+) "(([^"\\]|\\.)*)"$$,\1 "\2" \5 "\6",' | uniq >$@
# " # help emacs syntax highlighting
| 128 | |
# Remove a trailing index.html from requests and internal referrers.  On
# 2020-09-16, I got rid of the internal links to index.html, but there may still
# be a tail of external links (including bookmarks) and search engines that
# haven't updated.  I don't think there's any important case in which a trailing
# index.html is not equivalent to the directory.
#
# The two sed expressions (both keep the  IP "request" status "referrer"  layout):
#  1. Request ending in /index.html: keep everything up to and including the
#     slash (\2) and drop "index.html".
#  2. Referrer that starts with MY_SITE_RE and ends in index.html, either
#     directly after the site prefix or after a path ending in a slash: keep
#     the part before "index.html" (\5).  Referrers on other sites are left
#     untouched.
# In both, \1 is the IP and \4 the status code.
$G/irsr-index-html-snipped: $G/irsr-deduped
	<$< sed -r -e 's,^([^ ]+) "(([^"\\]|\\.)*/)index\.html" ([0-9]+) "(([^"\\]|\\.)*)"$$,\1 "\2" \4 "\5",' \
	    -e 's,^([^ ]+) "(([^"\\]|\\.)*)" ([0-9]+) "($(MY_SITE_RE)(([^"\\]|\\.)*/)?)index\.html"$$,\1 "\2" \4 "\5",' >$@
| 137 | |
# Feed the cleaned-up request lines through the ./group-requests script (kept
# outside this makefile) to produce the final report.
#
# The script is listed as a prerequisite so that editing it rebuilds the
# output.  It comes second so that $< still names the input file.
$G/requests: $G/irsr-index-html-snipped group-requests
	<$< ./group-requests >$@