[utils/utils.git] / web-logs / Makefile

# Matt McCutchen's web log analysis makefile
# https://mattmccutchen.net/utils/#web-logs

# I do wish for a build tool that rebuilds on command change.  Look for one?  mgear even? ~ 2020-09-16

# Site-specific settings live in config.mk, which must exist (a plain
# "include" makes a missing file fatal).  Example of what goes in config.mk :
# MY_SITE_RE := https://mattmccutchen\.net/
# MY_IP_RE := '^123\.124\.125\.126 '
# MY_SITE_RE is spliced into a single-quoted "sed -r" script that uses "," as
# the s,,, delimiter, so it is an extended regexp and must contain neither
# single quotes nor commas.  MY_IP_RE is handed to grep exactly as written, so
# it carries its own shell quoting; the trailing space anchors the end of the
# client_ip field.
include config.mk

# Directory that all the rules below write into.  Abbreviated to one letter
# because a variable reference without parentheses or braces, like $G, always
# takes a single character as the variable name.
G := generated

# Default goal: build every report.
# I think these targets pull in the rest. ~ 2020-09-17
default: $G/logs-by-ip $G/requests $G/uas $G/uas-organic

# Remove everything previously generated (the directory itself is kept).
clean:
	rm -f $G/*

# "default" and "clean" are commands, not files.  Without this declaration, a
# stray file with either name in this directory would make the goal look up to
# date and silently turn it into a no-op.
.PHONY: default clean

# Delete a target whose recipe fails, so that a partially written file (the
# recipes here redirect straight into $@) is never mistaken for a good one.
.DELETE_ON_ERROR:

# Log line format (with one trailing space!):
# client_ip remote_logname_not_used http_authenticated_user [date] "request" response_status_code response_body_size "referrer" "user_agent"
#
# Standard sed- and (grep -E)-compatible regular expression for matching the
# fields we care about (client_ip, request, response_status_code, referrer,
# user_agent) while passing through the others.  The content of a quoted field
# is matched as ([^"\\]|\\.)* , i.e. any run of ordinary characters and
# backslash escapes, so an escaped quote inside a field does not end it.
# (Unfortunately, sed doesn't seem to support (?:...) or anything like that.  I don't think it's worth seeking a better tool right now. ~ 2020-09-16)

# Earlier version that did not capture the status code, kept for reference:
#log_line_regex := ^([^ ]+)( [^"]*)"(([^"\\]|\\.)*)"([^"]*)"(([^"\\]|\\.)*)" "(([^"\\]|\\.)*)" $$
log_line_regex := ^([^ ]+)( [^"]*)"(([^"\\]|\\.)*)" ([0-9]+) ([^"]*)"(([^"\\]|\\.)*)" "(([^"\\]|\\.)*)" $$
# "  # help emacs syntax highlighting
#
# Note, the dollar is doubled for make.  Also, this contains no single quotes,
# which is important because we'll pass it to the shell in single quotes.  No
# commas either because that's my favorite delimiter for s,,, when the strings
# may contain slashes (typically due to file paths).
#
# Reconstituting the whole line: '\1\2"\3" \5 \6"\7" "\9" ' (remove the single quotes but note the trailing space)
# Interesting fields:
# \1: IP
# \3: request (content escaped but without surrounding quotes so you can manipulate prefixes/suffixes)
# \5: response status code
# \7: referrer (ditto \3)
# \9: user agent (ditto \3)
# Pass-through fields:
# \2: everything between the IP and the opening quote of the request
#     (logname, user, [date]), including the surrounding spaces
# \6: response body size, including its trailing space
# \4 and \8 are only the inner groups of the repeated alternations.

# Take the previous day's logs.  If I look once a day (DreamHost time), this
# should catch everything with no overlap.  I previously included the current
# day's logs, but I now think the difficulty of maintaining an accurate sense of
# usage in the presence of overlap is a greater evil than up to a day of extra
# latency.  I might find a better solution in the future.
#
# Filename pattern for recent-logs is locked down in pull-logs, so we shouldn't have shell injection here.
#
# "ls -t" lists newest first, so "head -n 2 | tail -n 1" picks the second-newest
# file (or the only file, if there is just one).  With a single line of input
# the final "tac" changes nothing.
recent_logs := $(shell ls -t recent-logs/access.log* | head -n 2 | tail -n 1 | tac)

# Concatenate the selected log file(s) and verify that every line matches
# log_line_regex; the later sed scripts rely on that.
#
# If no file matched the pattern above, $^ is empty and a bare "cat" would sit
# waiting on standard input, so fail with an explicit message instead.
#
# The check prints any offending lines.  "grep -v" exits with status 1 when it
# selects nothing, which is the only good outcome here: 0 means malformed lines
# were found and 2 means grep itself failed.
$G/logs: $(recent_logs)
	@[ $(words $^) -gt 0 ] || { echo >&2 'Error: No files match recent-logs/access.log*; nothing to analyze.'; exit 1; }
	cat $^ >$@
	@grep -vE '$(log_line_regex)' $@; if [ $$? != 1 ]; then echo >&2 'Error: Malformed log lines; please address before continuing.'; false; fi

# Drop my own requests, recognized by MY_IP_RE from config.mk.
# A crude measure that will take out anyone else accessing the site
# from the same public IP address as well. :/
# But I can't think of an easy, better alternative. ~ 2020-09-16
# This needs to be a separate step from temp-bot-ip-regexps so that uas doesn't include me.
#
# config.mk is a prerequisite so that changing MY_IP_RE regenerates the file;
# $< is still the log because it is the first prerequisite.
#
# "grep -v" exits with status 1 when it outputs no lines at all (e.g. the only
# traffic was mine).  That is a valid, empty result rather than a failure, so
# only a status of 2 or more (a real grep error) fails the recipe.
$G/logs-not-me: $G/logs config.mk
	<$< grep -v $(MY_IP_RE) >$@ || [ $$? -eq 1 ]

# bot-uas:
# I tried to be conservative and not add generic client libraries unless
# it was clear they generally displayed bot-like behavior.  Such libraries so far:
# axios, Datanyze.

# sed -r command that quotes a literal string for use inside a basic regular
# expression for grep, by putting a backslash before each of . [ ] ^ $ * \ and
# the double quote.  The other metacharacters are behind \ in basic regexps
# anyway. ~ 2020-09-07
#
# The layout of the bracket expression matters: a backslash does not escape
# anything inside [...], so "]" has to come first to be taken literally, and
# "[" must not be followed directly by ".", "=" or ":" (that would open a
# POSIX collating symbol, equivalence class or character class).  The earlier
# spelling [.\[\]^$\\"] ended the list at its first "]" and left a pattern that
# could never match, so nothing was actually being quoted.
s_quote_basic_re := s,([].[^$$*\\"]),\\\1,g
# "  # help emacs syntax highlighting

# Turn each line of config/bot-uas into a grep pattern: quote it with
# s_quote_basic_re, then wrap it as  "<user agent>" $  so that it can only
# match the last quoted field of a log line (note the trailing space).
$(G)/bot-ua-regexps: config/bot-uas
	sed -r -e '$(s_quote_basic_re)' -e 's,^(.*)$$, "\1" $$,' <$< >$@

# Some clients behave like bots but send a user agent that I don't want to
# block, because blocking it might also catch organic traffic.  After
# reviewing any apparently organic traffic from such a client's IP address,
# add the address to config/temp-bot-ips for the time being.
#
# Each address is quoted with s_quote_basic_re and wrapped as  ^<ip><space>  so
# that it can only match the whole client_ip field at the start of a line.
$(G)/temp-bot-ip-regexps: config/temp-bot-ips
	sed -r -e '$(s_quote_basic_re)' -e 's,^(.*)$$,^\1 ,' <$< >$@

# Requests that are neither mine nor from a known bot: drop every line that
# matches a pattern from either generated pattern file.
#
# "grep -v" exits with status 1 when it outputs no lines at all (e.g. a day
# with nothing but bot traffic).  That is a valid, empty result rather than a
# failure, so only a status of 2 or more (a real grep error) fails the recipe.
$G/logs-organic: $G/logs-not-me $G/bot-ua-regexps $G/temp-bot-ip-regexps
	<$< grep -v -f $G/bot-ua-regexps -f $G/temp-bot-ip-regexps >$@ || [ $$? -eq 1 ]

# sed command that undoes the log's backslash escaping inside one field: every
# backslash followed by a character is replaced by that character alone.
# NOTE(review): a multi-character escape (e.g. \xHH, if the server writes
# those) would merely lose its backslash -- confirm whether that matters.
s_unescape_field := s,\\(.),\1,g
# Shell filter that turns a stream of lines into "count line" pairs, highest
# count first.
count_and_rank := { sort | uniq -c | sort -k1,1 -r -n; }

# Shared recipe body: extract the user agent (\9) from every line of the first
# prerequisite, unescape it, and write the ranked counts to the target.
# Note: dynamically expanded so we can use it in multiple rules.
# Also note: the "logs" rule guarantees that all lines match.
uas_command = <$< sed -re 's,$(log_line_regex),\9,; $(s_unescape_field)' | $(count_and_rank) >$@

# Ranked user-agent counts, one file per input:
#
#   uas          from logs-not-me, i.e. bots included.  For when I'm curious
#                about the distribution of user agents at the request level.
#                To measure actual server load from bots, I could come a
#                little closer by looking at response size and whether CGIs
#                are invoked.
#   uas-organic  from logs-organic, i.e. for non-bots or identifying new bots.
#
# Each target gets its single prerequisite from its own line; the recipe is
# shared and reads that prerequisite through $< in uas_command.
$(G)/uas: $(G)/logs-not-me
$(G)/uas-organic: $(G)/logs-organic
$(G)/uas $(G)/uas-organic:
	$(uas_command)

# Bring all lines from the same client IP (field 1) together.  Which IP comes
# first is of no interest; only the grouping matters.
$(G)/logs-by-ip: $(G)/logs-organic
	sort -k1,1 <$< >$@

# Format of the reduced "irsr" lines (IP, request, status code, referrer) that
# the sed scripts below produce and rewrite:
irsr_line_regex := ^([^ ]+) "(([^"\\]|\\.)*)" ([0-9]+) "(([^"\\]|\\.)*)"$$
# Reconstituting: \1 "\2" \4 "\5"
# \1: IP
# \2: request
# \4: response status code
# \5: referrer
# (\3 and \6 are only the inner groups of the repeated alternations.)
# NOTE(review): no rule in this part of the file references irsr_line_regex;
# the sed scripts below spell out variants of it inline.

# Reduce each log line to IP, request (without HTTP version), status code, and
# referrer, then collapse runs of identical lines with uniq.  Because the input
# is grouped by IP, such runs are repeated requests from one IP (common case:
# range requests for the same streaming video).
#
# The status code is kept so that I can spot problems that are my fault (e.g.,
# 404) and fix them.  Hopefully it doesn't break grouping much.
#
# Two substitutions are needed: the first picks the four fields out of the
# full log line, and the second cuts the request at its last space, dropping
# what follows (the HTTP version).  Folding the second into log_line_regex
# would exceed sed's limit of 9 backreferences.  We could consider migrating
# to another tool
# (https://stackoverflow.com/questions/4318114/circumvent-the-sed-backreference-limit-1-through-9).
$(G)/irsr-deduped: $(G)/logs-by-ip
	sed -r \
	    -e 's,$(log_line_regex),\1 "\3" \5 "\7",' \
	    -e 's,^([^ ]+) "(([^"\\]|\\.)*) ([^"\\]|\\.)*" ([0-9]+) "(([^"\\]|\\.)*)"$$,\1 "\2" \5 "\6",' \
	    <$< | uniq >$@
# "  # help emacs syntax highlighting

# Remove a trailing index.html from requests and internal referrers.  On
# 2020-09-16, I got rid of the internal links to index.html, but there may still
# be a tail of external links (including bookmarks) and search engines that
# haven't updated.  I don't think there's any important case in which a trailing
# index.html is not equivalent to the directory.
#
# The first expression handles the request field (anything ending in
# /index.html); the second handles the referrer field, but only when the
# referrer starts with MY_SITE_RE.
#
# config.mk is a prerequisite so that changing MY_SITE_RE regenerates the
# file; $< is still the input because it is the first prerequisite.
$G/irsr-index-html-snipped: $G/irsr-deduped config.mk
	<$< sed -r -e 's,^([^ ]+) "(([^"\\]|\\.)*/)index\.html" ([0-9]+) "(([^"\\]|\\.)*)"$$,\1 "\2" \4 "\5",' \
	    -e 's,^([^ ]+) "(([^"\\]|\\.)*)" ([0-9]+) "($(MY_SITE_RE)(([^"\\]|\\.)*/)?)index\.html"$$,\1 "\2" \4 "\5",' >$@

# Final report: feed the cleaned-up irsr lines through the ./group-requests
# script.
#
# The script is listed as a prerequisite so that editing it regenerates the
# report; $< is still the input file because it is the first prerequisite.
$G/requests: $G/irsr-index-html-snipped group-requests
	<$< ./group-requests >$@
Commit	Line	Data
35d3e321 MM	1	# Matt McCutchen's web log analysis makefile
	2	# https://mattmccutchen.net/utils/#web-logs
	3
	4	# I do wish for a build tool that rebuilds on command change. Look for one? mgear even? ~ 2020-09-16
	5
	6	# Example of what goes in config.mk :
	7	# MY_SITE_RE := https://mattmccutchen\.net/
	8	# MY_IP_RE := '^123\.124\.125\.126 '
	9	include config.mk
	10
	11	# Abbreviation. Note: A variable reference without parentheses or
	12	# braces, like $G, always takes a single character as the variable name.
	13	G := generated
	14
9ed13607 MM	15	# I think these targets pull in the rest. ~ 2020-09-17
9ed13607 MM	16	default: $G/logs-by-ip $G/requests $G/uas $G/uas-organic
35d3e321 MM	17
	18	clean:
	19	rm -f $G/*
	20
	21	.DELETE_ON_ERROR:
	22
	23	# Log line format (with one trailing space!):
	24	# client_ip remote_logname_not_used http_authenticated_user [date] "request" response_status_code response_body_size "referrer" "user_agent"
	25	#
	26	# Standard sed- and (grep -E)-compatible regular expression for matching the fields we care about (client_ip, request, referrer, user_agent) while passing through the others:
	27	# (Unfortunately, sed doesn't seem to support (?:...) or anything like that. I don't think it's worth seeking a better tool right now. ~ 2020-09-16)
	28
	29	#log_line_regex := ^([^ ]+)( [^"])"(([^"\\]\|\\.))"([^"])"(([^"\\]\|\\.))" "(([^"\\]\|\\.)*)" $$
	30	log_line_regex := ^([^ ]+)( [^"])"(([^"\\]\|\\.))" ([0-9]+) ([^"])"(([^"\\]\|\\.))" "(([^"\\]\|\\.)*)" $$
	31	# " # help emacs syntax highlighting
	32	#
	33	# Note, the dollar is doubled for make. Also, this contains no single quotes,
	34	# which is important because we'll pass it to the shell in single quotes. No
	35	# commas either because that's my favorite delimiter for s,,, when the strings
	36	# may contain slashes (typically due to file paths).
	37	#
	38	# Reconstituting the whole line: '\1\2"\3" \5 \6"\7" "\9" ' (remove the single quotes but note the trailing space)
	39	# Interesting fields:
	40	# \1: IP
	41	# \3: request (content escaped but without surrounding quotes so you can manipulate prefixes/suffixes)
	42	# \5: response status code
	43	# \7: referrer (ditto \3)
	44	# \9: user agent (ditto \3)
	45
e05b3ea6 MM	46	# Take the previous day's logs. If I look once a day (DreamHost time), this
	47	# should catch everything with no overlap. I previously included the current
	48	# day's logs, but I now think the difficulty of maintaining an accurate sense of
	49	# usage in the presence of overlap is a greater evil than up to a day of extra
	50	# latency. I might find a better solution in the future.
35d3e321 MM	51	#
35d3e321 MM	52	# Filename pattern for recent-logs is locked down in pull-logs, so we shouldn't have shell injection here.
e05b3ea6	53	recent_logs := $(shell ls -t recent-logs/access.log* \| head -n 2 \| tail -n 1 \| tac)
35d3e321 MM	54
	55	$G/logs: $(recent_logs)
	56	cat $^ >$@
	57	@grep -vE '$(log_line_regex)' $@; if [ $$? != 1 ]; then echo >&2 'Error: Malformed log lines; please address before continuing.'; false; fi
	58
	59	# A crude measure that will take out anyone else accessing the site
	60	# from the same public IP address as well. :/
	61	# But I can't think of an easy, better alternative. ~ 2020-09-16
	62	# This needs to be a separate step from temp-bot-ip-regexps so that uas doesn't include me.
	63	$G/logs-not-me: $G/logs
	64	<$< grep -v $(MY_IP_RE) >$@
	65
	66	# bot-uas:
	67	# I tried to be conservative and not add generic client libraries unless
	68	# it was clear they generally displayed bot-like behavior. Such libraries so far:
	69	# axios, Datanyze.
	70
	71	# Quote a basic regular expression for grep. We only need to escape .[]^$\"
	72	# because the other metacharacters are behind \ anyway. ~ 2020-09-07
	73	s_quote_basic_re := s,([.\[\]^$$\\"]),\\\1,g
	74	# " # help emacs syntax highlighting
	75
	76	$G/bot-ua-regexps: config/bot-uas
	77	<$< sed -re '$(s_quote_basic_re); s,^(.*)$$, "\1" $$,' >$@
	78
	79	# Some dishonest clients show clearly bot-like behavior while using a user-agent
	80	# I don't want to block because it might catch organic traffic. Temporarily add
	81	# their IP addresses to temp-bot-ips after reviewing any apparently organic
	82	# traffic from those IP addresses.
	83	$G/temp-bot-ip-regexps: config/temp-bot-ips
	84	<$< sed -re '$(s_quote_basic_re); s,^(.*)$$,^\1 ,' >$@
	85
	86	$G/logs-organic: $G/logs-not-me $G/bot-ua-regexps $G/temp-bot-ip-regexps
	87	<$< grep -v -f $G/bot-ua-regexps -f $G/temp-bot-ip-regexps >$@
	88
	89	s_unescape_field := s,\\(.),\1,g
	90	count_and_rank := { sort \| uniq -c \| sort -k1,1 -r -n; }
	91
	92	# Note: dynamically expanded so we can use it in multiple rules.
	93	# Also note: the "logs" rule guarantees that all lines match.
	94	uas_command = <$< sed -re 's,$(log_line_regex),\9,; $(s_unescape_field)' \| $(count_and_rank) >$@
	95
	96	# In case I'm curious about the distribution of user agents, including bots, at
	97	# the request level. If I want to measure actual server load from bots, I could
	98	# come a little closer by looking at response size and whether CGIs are invoked.
	99	$G/uas: $G/logs-not-me
9ed13607	100	$(uas_command)
35d3e321 MM	101
	102	# Ditto above but for non-bots or identifying new bots.
	103	$G/uas-organic: $G/logs-organic
9ed13607	104	$(uas_command)
35d3e321 MM	105
	106	# We don't care about the order of IPs here.
	107	$G/logs-by-ip: $G/logs-organic
	108	<$< sort -k1,1 >$@
	109
	110	irsr_line_regex := ^([^ ]+) "(([^"\\]\|\\.))" ([0-9]+) "(([^"\\]\|\\.))"$$
	111	# Reconstituting: \1 "\2" "\4"
	112	# \1: IP
	113	# \2: request
	114	# \4: response status code
	115	# \5: referrer
	116
	117	# Keep only IP, request (without HTTP version), status code, and referrer.
	118	# Dedup successive identical lines from the same IP (common case: range requests for same streaming video)
	119	#
	120	# Status code is here so I can spot problems that are my fault (e.g., 404) and fix them.
	121	# Hopefully it doesn't break grouping much.
	122	#
	123	# Unfortunately, adding the HTTP version splitting to log_line_regex would exceed sed's limit of 9
	124	# backreferences, so we use a separate substitution. We could consider migrating to another tool
	125	# (https://stackoverflow.com/questions/4318114/circumvent-the-sed-backreference-limit-1-through-9).
	126	$G/irsr-deduped: $G/logs-by-ip
	127	<$< sed -re 's,$(log_line_regex),\1 "\3" \5 "\7",' \
	128	-e 's,^([^ ]+) "(([^"\\]\|\\.)) ([^"\\]\|\\.)" ([0-9]+) "(([^"\\]\|\\.)*)"$$,\1 "\2" \5 "\6",' \| uniq >$@
	129	# " # help emacs syntax highlighting
	130
	131	# Remove a trailing index.html from requests and internal referrers. On
	132	# 2020-09-16, I got rid of the internal links to index.html, but there may still
	133	# be a tail of external links (including bookmarks) and search engines that
	134	# haven't updated. I don't think there's any important case in which a trailing
	135	# index.html is not equivalent to the directory.
	136	$G/irsr-index-html-snipped: $G/irsr-deduped
	137	<$< sed -r -e 's,^([^ ]+) "(([^"\\]\|\\.)/)index\.html" ([0-9]+) "(([^"\\]\|\\.))"$$,\1 "\2" \4 "\5",' \
	138	-e 's,^([^ ]+) "(([^"\\]\|\\.))" ([0-9]+) "($(MY_SITE_RE)(([^"\\]\|\\.)/)?)index\.html"$$,\1 "\2" \4 "\5",' >$@
	139
	140	$G/requests: $G/irsr-index-html-snipped
	141	<$< ./group-requests >$@