| 1 | #!/usr/bin/env perl |
| 2 | # group-requests: Helper for Matt McCutchen's web log analysis process |
| 3 | # https://mattmccutchen.net/utils/#web-logs |
| 4 | |
| 5 | # Input: generated/irsr format |
| 6 | |
| 7 | use strict; |
| 8 | use warnings; |
| 9 | |
| 10 | #sub unescape_field($) { |
| 11 | # return ($_[0] =~ s,\\(.),$1,g); |
| 12 | #} |
| 13 | |
| 14 | #sub escape_field($) { |
| 15 | # return ($_[0] =~ s,([\\"]),\\$1,g); |
| 16 | #} |
| 17 | |
# Aggregated counts, keyed by status code.  Shape:
#   $status_objs{STATUS} = {
#       status   => STATUS,
#       req_objs => { REQUEST => {
#           request  => REQUEST,
#           count    => N,
#           ref_objs => { REFERRER => { referrer => REFERRER, count => N } },
#       } },
#   }
# Duplicating each key inside its object makes iteration easier later.
my %status_objs = ();

# irsr_line_regex from Makefile: a leading field without spaces (not used
# here), a double-quoted request, a numeric status code and a double-quoted
# referrer.  Quoted fields may contain backslash escapes; they are stored
# verbatim, without unescaping.  Named captures keep the field extraction
# independent of how many groups the pattern contains.
my $irsr_line_regex =
    qr{^[^ ]+ "(?<request>(?:[^"\\]|\\.)*)" (?<status>[0-9]+) "(?<referrer>(?:[^"\\]|\\.)*)"$};

while (my $line = <STDIN>) {
    # Strip the newline so the error message below stays on one line; the
    # trailing `$` anchor matches identically with or without it.
    chomp $line;

    # No trailing "\n" in the message, so Perl still appends the script
    # location and the STDIN line number.
    $line =~ $irsr_line_regex
        or die "Malformed line: $line";
    my ($request, $status, $referrer) = ($+{request}, $+{status}, $+{referrer});

    # Create each nesting level on first sight, then bump the counters.
    my $s_obj = ($status_objs{$status} //= {status => $status, req_objs => {}});
    my $req_obj = ($s_obj->{req_objs}->{$request}
        //= {request => $request, count => 0, ref_objs => {}});
    $req_obj->{count}++;
    my $ref_obj = ($req_obj->{ref_objs}->{$referrer}
        //= {referrer => $referrer, count => 0});
    $ref_obj->{count}++;
}
| 32 | |
# Emit the report grouped by status code (ascending, numeric) to make it easy
# to separate both my-fault and abuse 404s from the bulk of 200s.
for my $status_group (sort { $a->{status} <=> $b->{status} } values %status_objs) {
    my $code = $status_group->{status};
    print "=== STATUS CODE $code ===\n";

    # Most frequent requests first; ties are broken alphabetically to make
    # the listing a little easier to read.
    my @requests = sort {
        $b->{count} <=> $a->{count} || $a->{request} cmp $b->{request}
    } values %{ $status_group->{req_objs} };

    for my $req (@requests) {
        # Same ordering rule for the referrers of this request.
        my @referrers = sort {
            $b->{count} <=> $a->{count} || $a->{referrer} cmp $b->{referrer}
        } values %{ $req->{ref_objs} };

        # Special case: a lone referrer goes on the request line itself,
        # which keeps the output compact.
        if (@referrers == 1) {
            printf "%4d \"%s\" \"%s\"\n", $req->{count}, $req->{request}, $referrers[0]{referrer};
            next;
        }

        # Width 4 is plenty: that will be the day when we get more than 9999
        # identical request lines in less than 2 days...
        printf "%4d \"%s\"\n", $req->{count}, $req->{request};
        printf " %4d \"%s\"\n", $_->{count}, $_->{referrer} for @referrers;
    }
}