Commit | Line | Data |
---|---|---|
35d3e321 MM |
1 | # Matt McCutchen's web log analysis makefile |
2 | # https://mattmccutchen.net/utils/#web-logs | |
3 | ||
4 | # I do wish for a build tool that rebuilds on command change. Look for one? mgear even? ~ 2020-09-16 | |
5 | ||
# Example of what goes in config.mk :
# MY_SITE_RE := https://mattmccutchen\.net/
# MY_IP_RE := '^123\.124\.125\.126 '
include config.mk

# Both settings are spliced directly into grep/sed commands below, so stop
# right away with a clear message if config.mk left one of them out. In
# particular, an empty MY_SITE_RE would make the referrer rewrite in
# irsr-index-html-snipped apply to every site rather than just this one.
ifeq ($(strip $(MY_SITE_RE)),)
$(error MY_SITE_RE is empty; set it in config.mk)
endif
ifeq ($(strip $(MY_IP_RE)),)
$(error MY_IP_RE is empty; set it in config.mk)
endif
10 | ||
# Short alias for the output directory. The one-letter name is deliberate: make
# reads an unparenthesized reference such as $G as a single-character variable
# name, so every generated file below can be spelled $G/<file>.
G := generated
14 | ||
9ed13607 MM |
# Neither of these targets names a file that gets created, so declare them
# phony; otherwise a stray file called "default" or "clean" would make them
# look up to date and silently do nothing.
.PHONY: default clean

# I think these targets pull in the rest. ~ 2020-09-17
default: $G/logs-by-ip $G/requests $G/uas $G/uas-organic

# Remove every file in the output directory.
clean:
	rm -f $G/*
20 | ||
21 | .DELETE_ON_ERROR: | |
22 | ||
23 | # Log line format (with one trailing space!): | |
24 | # client_ip remote_logname_not_used http_authenticated_user [date] "request" response_status_code response_body_size "referrer" "user_agent" | |
25 | # | |
26 | # Standard sed- and (grep -E)-compatible regular expression for matching the fields we care about (client_ip, request, referrer, user_agent) while passing through the others: | |
27 | # (Unfortunately, sed doesn't seem to support (?:...) or anything like that. I don't think it's worth seeking a better tool right now. ~ 2020-09-16) | |
28 | ||
# Contents of one double-quoted log field: any run of characters other than
# a double quote or backslash, or backslash-escaped pairs. It contributes two
# capture groups (the whole contents, plus an inner helper group).
quoted_field_re := (([^"\\]|\\.)*)
# " # help emacs syntax highlighting

# Whole-line pattern: IP, pass-through text, "request", status code,
# pass-through text, "referrer", "user agent", then the trailing space.
log_line_regex := ^([^ ]+)( [^"]*)"$(quoted_field_re)" ([0-9]+) ([^"]*)"$(quoted_field_re)" "$(quoted_field_re)" $$
32 | # | |
33 | # Note, the dollar is doubled for make. Also, this contains no single quotes, | |
34 | # which is important because we'll pass it to the shell in single quotes. No | |
35 | # commas either because that's my favorite delimiter for s,,, when the strings | |
36 | # may contain slashes (typically due to file paths). | |
37 | # | |
38 | # Reconstituting the whole line: '\1\2"\3" \5 \6"\7" "\9" ' (remove the single quotes but note the trailing space) | |
39 | # Interesting fields: | |
40 | # \1: IP | |
41 | # \3: request (content escaped but without surrounding quotes so you can manipulate prefixes/suffixes) | |
42 | # \5: response status code | |
43 | # \7: referrer (ditto \3) | |
44 | # \9: user agent (ditto \3) | |
45 | ||
e05b3ea6 MM |
46 | # Take the previous day's logs. If I look once a day (DreamHost time), this |
47 | # should catch everything with no overlap. I previously included the current | |
48 | # day's logs, but I now think the difficulty of maintaining an accurate sense of | |
49 | # usage in the presence of overlap is a greater evil than up to a day of extra | |
50 | # latency. I might find a better solution in the future. | |
35d3e321 MM |
51 | # |
52 | # Filename pattern for recent-logs is locked down in pull-logs, so we shouldn't have shell injection here. | |
# Second-newest access log by modification time (or the only one, if there is
# just a single file): list newest first, keep two, take the last of those.
recent_logs := $(shell ls -t recent-logs/access.log* | head -n 2 | tail -n 1)
35d3e321 MM |
54 | |
# Concatenate the selected raw logs, then verify that every line matches
# log_line_regex. grep -v prints the offending lines and exits 1 only when
# there are none, so any other status (0 = malformed lines were found,
# 2 = grep itself failed) aborts the build.
$G/logs: $(recent_logs)
	cat $^ >$@
	@grep -vE '$(log_line_regex)' $@; \
	if [ $$? != 1 ]; then \
		echo >&2 'Error: Malformed log lines; please address before continuing.'; \
		false; \
	fi
58 | ||
# A crude measure that will take out anyone else accessing the site
# from the same public IP address as well. :/
# But I can't think of an easy, better alternative. ~ 2020-09-16
# This needs to be a separate step from temp-bot-ip-regexps so that uas doesn't include me.
#
# grep exits 1 when it prints no lines at all, which here just means that
# every request in the log was mine (or the log was empty). Treat that as
# success so the build is not aborted; a real grep failure (status 2) still
# fails the rule.
$G/logs-not-me: $G/logs
	<$< grep -v $(MY_IP_RE) >$@ || [ $$? = 1 ]
65 | ||
66 | # bot-uas: | |
67 | # I tried to be conservative and not add generic client libraries unless | |
68 | # it was clear they generally displayed bot-like behavior. Such libraries so far: | |
69 | # axios, Datanyze. | |
70 | ||
# sed -r command that escapes a literal string so that grep, in basic regular
# expression mode, matches exactly that text. The characters that are special
# in a BRE without a leading backslash are . [ ^ $ * and \ itself; every other
# metacharacter is behind \ anyway, and ] and the double quote are ordinary
# outside a bracket expression, so they are left alone.
#
# The layout of the bracket expression matters. POSIX gives backslash no
# escaping power inside [...], so the set must not rely on \[ or \]. Here ^ is
# kept away from the first position (where it would negate the set), and [ is
# placed last so it cannot be read as the start of a [. [: or [= construct.
s_quote_basic_re := s,([$$.*^\\[]),\\\1,g
75 | ||
# Turn each line of config/bot-uas into a grep pattern: escape it, then wrap
# it as ' "<ua>" ' anchored at end of line, i.e. the final quoted field of a
# log line followed by the trailing space.
$G/bot-ua-regexps: config/bot-uas
	sed -r -e '$(s_quote_basic_re)' -e 's,^(.*)$$, "\1" $$,' <$< >$@
78 | ||
# Some dishonest clients behave like bots while sending a user agent that I
# don't want to block, because blocking it might also catch organic traffic.
# After reviewing any apparently organic traffic from such a client's IP
# address, add the address to temp-bot-ips temporarily.
#
# Each address becomes a grep pattern anchored at the start of the line and
# followed by a space, so it only matches the client-IP field.
$G/temp-bot-ip-regexps: config/temp-bot-ips
	sed -r -e '$(s_quote_basic_re)' -e 's,^(.*)$$,^\1 ,' <$< >$@
85 | ||
# Drop every line that matches a known bot user agent or a temporarily
# blocked IP address.
#
# grep exits 1 when it prints no lines at all, which here just means that all
# remaining traffic was from bots (or there was none). Treat that as success
# so the build is not aborted; a real grep failure (status 2) still fails the
# rule.
$G/logs-organic: $G/logs-not-me $G/bot-ua-regexps $G/temp-bot-ip-regexps
	<$< grep -v -f $G/bot-ua-regexps -f $G/temp-bot-ip-regexps >$@ || [ $$? = 1 ]
88 | ||
# sed command: undo the log's backslash escaping inside a field.
s_unescape_field := s,\\(.),\1,g
# Shell pipeline stage: count identical input lines and list them with the
# highest count first.
count_and_rank := { sort | uniq -c | sort -rn -k1,1; }
91 | ||
# Recipe shared by the user-agent tallies: pull out the user-agent field,
# unescape it, then count and rank the distinct values. Defined with = rather
# than := so that $< and $@ are expanded inside whichever rule uses it.
# The "logs" rule guarantees that every line matches log_line_regex.
uas_command = sed -r -e 's,$(log_line_regex),\9,' -e '$(s_unescape_field)' <$< | $(count_and_rank) >$@
95 | ||
# Request-level distribution of user agents for all traffic except my own,
# bots included, in case I'm curious. If I want to measure actual server load
# from bots, I could come a little closer by looking at response size and
# whether CGIs are invoked.
$G/uas: $G/logs-not-me
	$(uas_command)

# The same tally restricted to organic traffic: useful for looking at non-bots
# and for identifying new bots.
$G/uas-organic: $G/logs-organic
	$(uas_command)
35d3e321 MM |
105 | |
# Bring each client's requests together by sorting on the IP field. The order
# of the IPs themselves doesn't matter.
$G/logs-by-ip: $G/logs-organic
	sort -k1,1 <$< >$@
109 | ||
110 | irsr_line_regex := ^([^ ]+) "(([^"\\]|\\.)*)" ([0-9]+) "(([^"\\]|\\.)*)"$$ | |
# Reconstituting: \1 "\2" \4 "\5"
112 | # \1: IP | |
113 | # \2: request | |
114 | # \4: response status code | |
115 | # \5: referrer | |
116 | ||
# Reduce each log line to IP, request (HTTP version removed), status code, and
# referrer, then collapse runs of identical consecutive lines from the same IP
# (common case: range requests for the same streaming video).
#
# The status code is kept so that I can spot problems that are my fault (e.g.,
# 404) and fix them. Hopefully it doesn't break grouping much.
#
# Splitting off the HTTP version inside log_line_regex itself would exceed
# sed's limit of 9 backreferences, hence the second substitution. Migrating to
# another tool is an option
# (https://stackoverflow.com/questions/4318114/circumvent-the-sed-backreference-limit-1-through-9).
$G/irsr-deduped: $G/logs-by-ip
	sed -r \
	  -e 's,$(log_line_regex),\1 "\3" \5 "\7",' \
	  -e 's,^([^ ]+) "(([^"\\]|\\.)*) ([^"\\]|\\.)*" ([0-9]+) "(([^"\\]|\\.)*)"$$,\1 "\2" \5 "\6",' \
	  <$< | uniq >$@
# " # help emacs syntax highlighting
130 | ||
# Strip a trailing index.html from requests (first substitution) and from
# referrers that point at my own site (second substitution). I got rid of the
# internal links to index.html on 2020-09-16, but there may still be a tail of
# external links (including bookmarks) and search engines that haven't
# updated. I don't think there's any important case in which a trailing
# index.html is not equivalent to the directory.
$G/irsr-index-html-snipped: $G/irsr-deduped
	sed -r \
	  -e 's,^([^ ]+) "(([^"\\]|\\.)*/)index\.html" ([0-9]+) "(([^"\\]|\\.)*)"$$,\1 "\2" \4 "\5",' \
	  -e 's,^([^ ]+) "(([^"\\]|\\.)*)" ([0-9]+) "($(MY_SITE_RE)(([^"\\]|\\.)*/)?)index\.html"$$,\1 "\2" \4 "\5",' \
	  <$< >$@
139 | ||
# Feed the cleaned-up request lines through the local group-requests script.
# The script is listed as a prerequisite so that editing it regenerates the
# output; it stays second so that $< still names the input file.
$G/requests: $G/irsr-index-html-snipped group-requests
	<$< ./group-requests >$@