web-logs/Makefile: Use only the previous day's logs to avoid overlap.
[utils/utils.git] / web-logs / Makefile
CommitLineData
35d3e321
MM
1# Matt McCutchen's web log analysis makefile
2# https://mattmccutchen.net/utils/#web-logs
3
4# I do wish for a build tool that rebuilds on command change. Look for one? mgear even? ~ 2020-09-16
5
6# config.mk supplies MY_SITE_RE and MY_IP_RE, which rules below rely on. Example of what goes in it:
7# MY_SITE_RE := https://mattmccutchen\.net/
8# MY_IP_RE := '^123\.124\.125\.126 '
9include config.mk
10
11# Abbreviation for the output directory name. Note: A variable reference without parentheses or
12# braces, like $G, always takes a single character as the variable name.
13G := generated
14
9ed13607
MM
15# Top-level goal listing the end products. I think these targets pull in the rest as prerequisites. ~ 2020-09-17
16default: $G/logs-by-ip $G/requests $G/uas $G/uas-organic
35d3e321
MM
17
18clean: # Delete the files directly under the output directory (the directory itself stays).
19 rm -f $G/*
20
21.DELETE_ON_ERROR: # Have make remove a target whose recipe failed, so no half-written file is left behind.
22
23# Log line format (with one trailing space!):
24# client_ip remote_logname_not_used http_authenticated_user [date] "request" response_status_code response_body_size "referrer" "user_agent"
25#
26# Standard sed- and (grep -E)-compatible regular expression for matching the fields we care about (client_ip, request, referrer, user_agent) while passing through the others:
27# (Unfortunately, sed doesn't seem to support (?:...) or anything like that. I don't think it's worth seeking a better tool right now. ~ 2020-09-16)
28
29# Older version, before the status code got its own group: log_line_regex := ^([^ ]+)( [^"]*)"(([^"\\]|\\.)*)"([^"]*)"(([^"\\]|\\.)*)" "(([^"\\]|\\.)*)" $$
30log_line_regex := ^([^ ]+)( [^"]*)"(([^"\\]|\\.)*)" ([0-9]+) ([^"]*)"(([^"\\]|\\.)*)" "(([^"\\]|\\.)*)" $$
31# " # help emacs syntax highlighting
32#
33# Note, the dollar is doubled for make. Also, this contains no single quotes,
34# which is important because we'll pass it to the shell in single quotes. No
35# commas either because that's my favorite delimiter for s,,, when the strings
36# may contain slashes (typically due to file paths).
37#
38# Reconstituting the whole line: '\1\2"\3" \5 \6"\7" "\9" ' (remove the single quotes but note the trailing space)
39# Interesting fields:
40# \1: IP
41# \3: request (content escaped but without surrounding quotes so you can manipulate prefixes/suffixes)
42# \5: response status code
43# \7: referrer (ditto \3)
44# \9: user agent (ditto \3)
45
e05b3ea6
MM
46# Take the previous day's logs. If I look once a day (DreamHost time), this
47# should catch everything with no overlap. I previously included the current
48# day's logs, but I now think the difficulty of maintaining an accurate sense of
49# usage in the presence of overlap is a greater evil than up to a day of extra
50# latency. I might find a better solution in the future.
35d3e321
MM
51# "ls -t | head -n 2 | tail -n 1" picks the second-newest file by mtime (or the only file, if there is just one).
52# Filename pattern for recent-logs is locked down in pull-logs, so we shouldn't have shell injection here.
e05b3ea6 53recent_logs := $(shell ls -t recent-logs/access.log* | head -n 2 | tail -n 1)
35d3e321
MM
54
55$G/logs: $(recent_logs) # Concatenate the selected raw logs, then insist that every line matches log_line_regex.
56 cat $^ >$@
57 @grep -vE '$(log_line_regex)' $@; [ $$? = 1 ] || { echo >&2 'Error: Malformed log lines; please address before continuing.'; false; }
58
59# Drop my own requests. A crude measure that will also take out anyone else accessing
60# the site from the same public IP address. :/ But I can't think of an easy, better alternative. ~ 2020-09-16
61# This needs to be a separate step from temp-bot-ip-regexps so that uas doesn't include me.
62# grep -v exits 1 when it outputs no lines; "|| [ $$? = 1 ]" accepts that (empty result) while real errors (status 2) still fail.
63$G/logs-not-me: $G/logs
64 <$< grep -v $(MY_IP_RE) >$@ || [ $$? = 1 ]
65
66# bot-uas:
67# I tried to be conservative and not add generic client libraries unless
68# it was clear they generally displayed bot-like behavior. Such libraries so far:
69# axios, Datanyze.
70
71# Quote a basic regular expression for grep. We only need to escape .[]^$\"* because the
72# other metacharacters are behind \ anyway (~ 2020-09-07). In a bracket expression, \ is literal and ] must come first, hence the [][...] spelling.
73s_quote_basic_re := s,([][\\.^$$"*]),\\\1,g
74# " # help emacs syntax highlighting
75
76$G/bot-ua-regexps: config/bot-uas # One grep pattern per listed user agent, anchored as the final quoted log field.
77 sed -r -e '$(s_quote_basic_re)' -e 's,^(.*)$$, "\1" $$,' <$< >$@
78
79# Temporary IP blocklist for dishonest clients: ones that behave like bots yet send a
80# user-agent I don't want to block outright because that might catch organic traffic.
81# Add their IP addresses to temp-bot-ips only after reviewing any apparently organic traffic
82# from them. Each address becomes a pattern anchored to the leading client-IP field ("^IP ").
83$G/temp-bot-ip-regexps: config/temp-bot-ips
84 sed -r -e '$(s_quote_basic_re)' -e 's,^(.*)$$,^\1 ,' <$< >$@
85
86$G/logs-organic: $G/logs-not-me $G/bot-ua-regexps $G/temp-bot-ip-regexps # Drop lines matching any bot pattern; an empty result (grep status 1) is not an error.
87 <$< grep -v -f $G/bot-ua-regexps -f $G/temp-bot-ip-regexps >$@ || [ $$? = 1 ]
88
89s_unescape_field := s,\\(.),\1,g
90count_and_rank := { sort | uniq -c | sort -k1,1 -r -n; }
91
92# uas_command: pull out the user agent (\9), undo backslash escapes (s_unescape_field), and tally with count_and_rank (highest count first).
93# Note: dynamically expanded so we can use it in multiple rules. Also note: the "logs" rule guarantees that all lines match.
94uas_command = <$< sed -re 's,$(log_line_regex),\9,; $(s_unescape_field)' | $(count_and_rank) >$@
95
96# User-agent tally over all requests except my own (bots included), in case I'm curious about the
97# distribution at the request level. If I want to measure actual server load from bots, I could
98# come a little closer by looking at response size and whether CGIs are invoked.
99$G/uas: $G/logs-not-me
9ed13607 100 $(uas_command)
35d3e321
MM
101
102# Same tally, restricted to logs-organic: for non-bots or identifying new bots.
103$G/uas-organic: $G/logs-organic
9ed13607 104 $(uas_command)
35d3e321
MM
105
106# Bring each client's lines together by sorting on the IP field; how the IPs are ordered relative to each other doesn't matter.
107$G/logs-by-ip: $G/logs-organic
108 sort -k1,1 $< >$@
109
110irsr_line_regex := ^([^ ]+) "(([^"\\]|\\.)*)" ([0-9]+) "(([^"\\]|\\.)*)"$$
111# Reconstituting: \1 "\2" \4 "\5"
112# \1: IP (the name "irsr" presumably abbreviates IP, request, status, referrer)
113# \2: request
114# \4: response status code
115# \5: referrer
116
117# Keep only IP, request (without HTTP version), status code, and referrer.
118# Dedup successive identical lines from the same IP (common case: range requests for same streaming video)
119# via uniq, which works because the input (logs-by-ip) already has each IP's lines adjacent.
120# Status code is here so I can spot problems that are my fault (e.g., 404) and fix them.
121# Hopefully it doesn't break grouping much.
122# The first -e reduces each line to: IP "request" status "referrer". The second -e drops the last space-separated word inside the request quotes (the HTTP version).
123# Unfortunately, adding the HTTP version splitting to log_line_regex would exceed sed's limit of 9
124# backreferences, so we use a separate substitution. We could consider migrating to another tool
125# (https://stackoverflow.com/questions/4318114/circumvent-the-sed-backreference-limit-1-through-9).
126$G/irsr-deduped: $G/logs-by-ip
127 <$< sed -re 's,$(log_line_regex),\1 "\3" \5 "\7",' \
128 -e 's,^([^ ]+) "(([^"\\]|\\.)*) ([^"\\]|\\.)*" ([0-9]+) "(([^"\\]|\\.)*)"$$,\1 "\2" \5 "\6",' | uniq >$@
129# " # help emacs syntax highlighting
130
131# Remove a trailing index.html from requests (first -e; only right after a "/") and from referrers
132# that start with MY_SITE_RE, i.e. internal ones (second -e). On 2020-09-16, I got rid of the internal
133# links to index.html, but there may still be a tail of external links (including bookmarks) and search
134# engines that haven't updated. I don't think there's any important case in which a trailing
135# index.html is not equivalent to the directory.
136$G/irsr-index-html-snipped: $G/irsr-deduped
137 <$< sed -r -e 's,^([^ ]+) "(([^"\\]|\\.)*/)index\.html" ([0-9]+) "(([^"\\]|\\.)*)"$$,\1 "\2" \4 "\5",' \
138 -e 's,^([^ ]+) "(([^"\\]|\\.)*)" ([0-9]+) "($(MY_SITE_RE)(([^"\\]|\\.)*/)?)index\.html"$$,\1 "\2" \4 "\5",' >$@
139
140$G/requests: $G/irsr-index-html-snipped group-requests # Also rebuild when the grouping script changes; $< is still the data file.
141 <$< ./group-requests >$@