#!/usr/bin/perl -w # # Lintian HTML reporting tool -- Create Lintian web reports # # Copyright (C) 1998 Christian Schwarz and Richard Braakman # Copyright (C) 2007 Russ Allbery # Copyright (C) 2017-2019 Chris Lamb # # This program is free software. It is distributed under the terms of # the GNU General Public License as published by the Free Software # Foundation; either version 2 of the License, or (at your option) any # later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, you can find it on the World Wide # Web at https://www.gnu.org/copyleft/gpl.html, or write to the Free # Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, # MA 02110-1301, USA. package reporting_html_reports; use v5.20; use warnings; use utf8; use autodie; use Getopt::Long; use POSIX qw(strftime); use File::Copy qw(copy); use Fcntl qw(SEEK_SET); use List::Util qw(first); use List::MoreUtils qw(uniq); use Path::Tiny; use Text::Template (); use URI::Escape; use YAML::XS (); use Lintian::Data; use Lintian::Deb822::Parser qw(read_dpkg_control_lc); use Lintian::IO::Async qw(safe_qx); use Lintian::Profile; use Lintian::Relation::Version qw(versions_comparator); use Lintian::Reporting::ResourceManager; use Lintian::Reporting::Util qw(load_state_cache find_backlog); use Lintian::Util qw(copy_dir run_cmd locate_executable); my $CONFIG; my %OPT; my %OPT_HASH = ('reporting-config=s'=> \$OPT{'reporting-config'},); # ------------------------------ # Global variables and configuration # Some globals initialised in init_global() my ( $RESOURCE_MANAGER, $LINTIAN_VERSION, $timestamp, $TEMPLATE_CONFIG_VARS,$HARNESS_STATE_DIR, $HISTORY_DIR, $HISTORY, $GRAPHS, $LINTIAN_BASE, $HTML_TMP_DIR, $SCOUR_ENABLED, ); # FIXME: Should become obsolete if gnuplot is replaced by R like piuparts.d.o / # reproducible.d.n is using my $GRAPHS_RANGE_DAYS = 366; # ------------------------------ # Initialize templates # This only has to be done once, so do it at the start and then reuse the same # templates throughout. our %templates; # %statistics accumulates global statistics. For tags: errors, warnings, # experimental, overridden, and info are the keys holding the count of tags of # that sort. For packages: binary, udeb, and source are the number of # packages of each type with Lintian errors or warnings. For maintainers: # maintainers is the number of maintainers with Lintian errors or warnings. # # %tag_statistics holds a hash of tag-specific statistics. Each tag name is a # key, and its value is a hash with the following keys: count and overrides # (number of times the tag has been detected and overridden, respectively), and # packages (number of packages with at least one such tag). my (%statistics, %tag_statistics); # %by_maint holds a hash of maintainer names to packages and tags. Each # maintainer is a key. The value is a hash of package names to hashes. Each # package hash is in turn a hash of versions to an anonymous array of hashes, # with each hash having keys code, package, type, tag, severity, # extra, and xref. xref gets the partial URL of the maintainer page for that # source package. # # In other words, the lintian output line: # # W: gnubg source: substvar-source-version-is-deprecated gnubg-data # # for gnubg 0.15~20061120-1 maintained by Russ Allbery is # turned into the following structure: # # { 'gnubg' => { # '0.15~20061120-1' => [ # { code => 'W', # Either 'O' or same as $tag_info->code # pkg_info => { # package => 'gnubg', # version => '0.15~20061120-1', # component => 'main', # type => 'source', # anchor => 'gnubg_0.15~20061120-1', # xref => 'rra@debian.org.html#gnubg_0.15~20061120-1' # }, # tag_info => $tag_info, # an instance of Lintian::Tag::Info # archs => { # # Architectures we have seen this tag for # 'amd64' => 1, # 'i386' => 1, # }, # extra => 'gnubg-data' # } ] } } # # and then stored under the key 'Russ Allbery ' # # %by_uploader holds the same thing except for packages for which the person # is only an uploader. # # %by_tag is a hash of tag names to an anonymous array of tag information # hashes just like the inside-most data structure above. my (%by_maint, %by_uploader, %by_tag, %maintainer_table, %delta); my @attrs = qw(maintainers source-packages binary-packages udeb-packages errors warnings info experimental pedantic overridden groups-known groups-backlog classifications groups-with-errors); my @RESTRICTED_CONFIG_DIRS= split(/:/, $ENV{'LINTIAN_RESTRICTED_CONFIG_DIRS'}); my @CONFIG_DIRS = split(/:/, $ENV{'LINTIAN_CONFIG_DIRS'}); sub load_profile { my ($profile_name, $options) = @_; my %opt = ( 'restricted-search-dirs' => \@RESTRICTED_CONFIG_DIRS, %{$options // {}}, ); require Lintian::Profile; my $profile = Lintian::Profile->new; $profile->load($profile_name, \@CONFIG_DIRS, \%opt); return $profile; } sub required_cfg_value { my (@keys) = @_; my $v = $CONFIG; for my $key (@keys) { if (not exists($v->{$key})) { my $k = join('.', @keys); die("Missing required config parameter: ${k}\n"); } $v = $v->{$key}; } return $v; } sub required_cfg_non_empty_list_value { my (@keys) = @_; my $v = required_cfg_value(@keys); if (not defined($v) or ref($v) ne 'ARRAY' or scalar(@{$v}) < 1) { my $k = join('.', @keys); die("Invalid configuration: ${k} must be a non-empty list\n"); } return $v; } # ------------------------------ # Main routine sub main { my $profile = init_globals(); setup_output_dir( 'output_dir' => $HTML_TMP_DIR, 'lintian_manual' => "${LINTIAN_BASE}/doc/lintian.html", 'lintian_api_docs' => "${LINTIAN_BASE}/doc/api.html", 'lintian_log_file' => $ARGV[0], 'resource_dirs' => [map { "${LINTIAN_BASE}/reporting/$_"} qw(images resources)], ); load_templates("$LINTIAN_BASE/reporting/templates"); # Create lintian.css from a template, install the output file as a resource # and discard the original output file. We do this after installing all # resources, so the .css file can refer to resources. output_template( 'lintian.css', $templates{'lintian.css'}, { 'path_prefix' => '../' }); $RESOURCE_MANAGER->install_resource("$HTML_TMP_DIR/lintian.css"); my $state_cache = load_state_cache($HARNESS_STATE_DIR); print "Parsing lintian log...\n"; parse_lintian_log($profile, $state_cache); process_data($profile, $state_cache); exit(0); } # ------------------------------ # Utility functions sub init_globals { Getopt::Long::config('bundling', 'no_getopt_compat', 'no_auto_abbrev'); Getopt::Long::GetOptions(%OPT_HASH) or die("error parsing options\n"); if (not $OPT{'reporting-config'} or not -f $OPT{'reporting-config'}) { die("The --reporting-config parameter must point to an existing file\n" ); } $LINTIAN_BASE = $ENV{'LINTIAN_BASE'}; $CONFIG = YAML::XS::LoadFile($OPT{'reporting-config'}); $HARNESS_STATE_DIR = required_cfg_value('storage', 'state-cache'); $HTML_TMP_DIR = required_cfg_value('storage', 'reports-work-dir'); my $history_key = 'storage.historical-data-dir'; if (exists($CONFIG->{'storage'}{'historical-data-dir'})) { $HISTORY = 1; $HISTORY_DIR = required_cfg_value('storage', 'historical-data-dir'); print "Enabling history tracking as ${history_key} is set\n"; if (length locate_executable('gnuplot')) { $GRAPHS = 1; print "Enabling graphs (gnuplot is in PATH)\n"; } else { $GRAPHS = 0; print "No graphs as \"gnuplot\" is not in PATH\n"; } if ($GRAPHS) { if (locate_executable('scour')) { $SCOUR_ENABLED = 1; print "Minimizing generated SVG files (scour is in PATH)\n"; } else { $SCOUR_ENABLED = 0; print 'No minimization of generated SVG files' . " as \"scour\" is not in PATH\n"; } } } else { $HISTORY = 0; $GRAPHS = 0; print "History tracking is disabled (${history_key} is unset)\n"; print "Without history tracking, there will be no graphs\n"; } if (exists($CONFIG->{'template-variables'})) { $TEMPLATE_CONFIG_VARS = $CONFIG->{'template-variables'}; } else { $TEMPLATE_CONFIG_VARS = {}; } # Provide a default URL for the source code. It might not be correct for # the given installation, but it is better than nothing. $TEMPLATE_CONFIG_VARS->{'LINTIAN_SOURCE'} //= 'https://salsa.debian.org/lintian/lintian.git'; my $profile = load_profile(); Lintian::Data->set_vendor($profile); $LINTIAN_VERSION = $ENV{LINTIAN_VERSION}; $timestamp = safe_qx(qw(date -u --rfc-822)); chomp($LINTIAN_VERSION, $timestamp); $RESOURCE_MANAGER = Lintian::Reporting::ResourceManager->new('html_dir' => $HTML_TMP_DIR,); return $profile; } sub load_templates { my ($template_dir) = @_; for my $template ( qw/head foot clean index maintainer maintainers packages tag tags tags-severity tag-not-seen tags-all/ ) { open(my $fd, '<:encoding(UTF-8)', "${template_dir}/$template.tmpl"); my %options = (TYPE => 'FILEHANDLE', SOURCE => $fd); $templates{$template} = Text::Template->new(%options) or die "cannot load template $template: $Text::Template::ERROR\n"; close($fd); } open(my $fd, '<:encoding(UTF-8)', "${template_dir}/lintian.css.tmpl"); $templates{'lintian.css'} = Text::Template->new( TYPE => 'FILEHANDLE', SOURCE => $fd, DELIMITERS => ['{{{', '}}}'], ) or die("cannot load template for lintian.css: $Text::Template::ERROR\n"); close($fd); return; } sub process_data { my ($profile, $state_cache) = @_; my @maintainers = sort(uniq(keys(%by_maint), keys(%by_uploader))); my $statistics_file = "$HARNESS_STATE_DIR/statistics"; my ($old_statistics, $archives, @archive_info); { # Scoped to allow memory to be re-purposed. The %qa and %sources # structures are only used for a very few isolated items. my (%qa, %sources); print "Collecting statistics...\n"; $old_statistics = collect_statistics($profile, $state_cache, $statistics_file, \@maintainers,\%sources, \%qa); generate_lookup_tables(\%sources); write_qa_list(\%qa); generate_package_index_packages(\%sources); if ($HISTORY) { update_history_and_make_graphs(\@attrs, \%statistics, \%tag_statistics); } } # Build a hash of all maintainers, not just those with Lintian tags. We # use this later to generate stub pages for maintainers whose packages are # all Lintian-clean. my %clean; for my $group_id (sort(keys(%{$state_cache->{'groups'}}))) { my $maintainer = $state_cache->{'groups'}{$group_id}{'mirror-metadata'} {'maintainer'}; my $id; next if not $maintainer; $id = maintainer_url($maintainer); $clean{$id} = $maintainer; } # Now, walk through the tags by source package (sorted by maintainer). # Output a summary page of errors and warnings for each maintainer, output # a full page that includes info, experimental, and overridden tags, and # assemble the maintainer index and the QA package list as we go. for my $maintainer (@maintainers) { my $id = maintainer_url($maintainer); delete $clean{$id}; # Determine if the maintainer's page is clean. Check all packages for # which they're either maintainer or uploader and set $error_clean if # they have no errors or warnings. # # Also take this opportunity to sort the tags so that all similar tags # will be grouped, which produces better HTML output. my $error_clean = 1; for my $source ( keys %{ $by_maint{$maintainer} }, keys %{ $by_uploader{$maintainer} } ) { my $versions = $by_maint{$maintainer}{$source} || $by_uploader{$maintainer}{$source}; for my $version (keys %$versions) { $versions->{$version} = [sort by_tag @{ $versions->{$version} }]; next if not $error_clean; my $tags = $versions->{$version}; for my $tag (@$tags) { if ($tag->{code} eq 'E' or $tag->{code} eq 'W') { $error_clean = 0; last; } } } } # Determine the parts of the maintainer and the file name for the # maintainer page. my ($name, $email) = extract_name_and_email($maintainer); my $regular = "maintainer/$id"; my $full = "full/$id"; # Create the regular maintainer page (only errors and warnings) and the # full maintainer page (all tags, including overrides and info tags). print "Generating page for $id\n"; my $q_name = html_quote($name); my %data = ( email => html_quote(uri_escape($email)), errors => 1, id => $id, maintainer => html_quote($maintainer), name => $q_name, packages => $by_maint{$maintainer}, uploads => $by_uploader{$maintainer}, ); my $template; if ($error_clean) { $template = $templates{clean}; } else { $template = $templates{maintainer}; } output_template($regular, $template, \%data); $template = $templates{maintainer}; $data{errors} = 0; output_template($full, $template, \%data); my %index_data = (url => $id, name => $q_name); # Add this maintainer to the hash of maintainer to URL mappings. $maintainer_table{$maintainer} = \%index_data; } undef(@maintainers); # Write out the maintainer index. my %data = (maintainers => \%maintainer_table,); output_template('maintainers.html', $templates{maintainers}, \%data); # Now, generate stub pages for every maintainer who has only clean # packages. for my $id (keys %clean) { my $maintainer = $clean{$id}; my ($name, $email) = extract_name_and_email($maintainer); my %maint_data = ( id => $id, email => html_quote(uri_escape($email)), maintainer => html_quote($maintainer), name => html_quote($name), clean => 1, ); print "Generating clean page for $id\n"; output_template("maintainer/$id", $templates{clean}, \%maint_data); output_template("full/$id", $templates{clean}, \%maint_data); } # Create the pages for each tag. Each page shows the extended description # for the tag and all the packages for which that tag was issued. for my $tag (sort $profile->known_tags) { my $info = $profile->get_taginfo($tag); my $description = $info->description('html', ' '); my ($count, $overrides) = (0, 0); my $tmpl = 'tag-not-seen'; my $shown_count = 0; my $tag_list = $by_tag{$tag}; my $tag_limit_total = 1024; my $tag_limit_per_package = 3; if (exists $by_tag{$tag}) { $tmpl = 'tag'; $count = $tag_statistics{$tag}{'count'}; $overrides = $tag_statistics{$tag}{'overrides'}; $shown_count = $count + $overrides; } if ($shown_count > $tag_limit_total) { my (@replacement_list, %seen); for my $orig_info ( sort { $a->{pkg_info}{package} cmp $b->{pkg_info}{package} } @{$tag_list}) { my $pkg_info = $orig_info->{pkg_info}; my $key = "$pkg_info->{package} $pkg_info->{type} $pkg_info->{version}"; next if ++$seen{$key} > $tag_limit_per_package; push(@replacement_list, $orig_info); last if @replacement_list >= $tag_limit_total; } $tag_list = \@replacement_list; $shown_count = scalar(@replacement_list); } my %maint_data = ( description => $description, tag => $tag, code => $info->code, tags => $tag_list, shown_count => $shown_count, tag_limit_per_package => $tag_limit_per_package, graphs => $GRAPHS, graphs_days => $GRAPHS_RANGE_DAYS, statistics => { count => $count, overrides => $overrides, total => $count + $overrides, }, ); output_template("tags/$tag.html", $templates{$tmpl}, \%maint_data); } # Create the general tag indices. %data = ( tags => \%by_tag, stats => \%tag_statistics, profile => \$profile, ); output_template('tags.html', $templates{tags}, \%data); output_template('tags-severity.html', $templates{'tags-severity'}, \%data); output_template('tags-all.html', $templates{'tags-all'}, \%data); # Update the statistics file. open(my $stats_fd, '>', $statistics_file); print {$stats_fd} "last-updated: $timestamp\n"; for my $attr (@attrs) { print {$stats_fd} "$attr: $statistics{$attr}\n"; } print {$stats_fd} "lintian-version: $LINTIAN_VERSION\n"; close($stats_fd); $archives = required_cfg_value('archives'); for my $archive (sort(keys(%{$archives}))) { my $architectures = required_cfg_non_empty_list_value('archives', $archive, 'architectures'); my $components = required_cfg_non_empty_list_value('archives', $archive, 'components'); my $distributions = required_cfg_non_empty_list_value('archives', $archive, 'distributions'); my $path = required_cfg_value('archives', $archive, 'base-dir'); my $trace_basename = required_cfg_value('archives', $archive, 'tracefile'); # The path to the mirror timestamp. my $trace_file= "${path}/project/trace/${trace_basename}"; my $mirror_timestamp = path($trace_file)->slurp; $mirror_timestamp =~ s/\n.*//s; $mirror_timestamp = safe_qx('date', '-u', '--rfc-822', '-d', $mirror_timestamp); my %info = ( 'name' => $archive, 'architectures' => $architectures, 'components' => $components, 'distributions' => $distributions, 'timestamp' => $mirror_timestamp, ); push(@archive_info, \%info); } # Finally, we can start creating the index page. %data = ( delta => \%delta, archives => \@archive_info, previous => $old_statistics->{'last-updated'}, graphs => $GRAPHS, graphs_days => $GRAPHS_RANGE_DAYS, ); output_template('index.html', $templates{index}, \%data); return; } sub setup_output_dir { my (%args) = @_; my $output_dir = $args{'output_dir'}; my $lintian_manual = $args{'lintian_manual'}; my $lintian_api = $args{'lintian_api_docs'}; my $resource_dirs = $args{'resource_dirs'} // []; my $lintian_log_file = $args{'lintian_log_file'}; # Create output directories. mkdir($output_dir, 0777); mkdir("$output_dir/full", 0777); mkdir("$output_dir/maintainer", 0777); mkdir("$output_dir/tags", 0777); symlink('.', "$output_dir/reports"); copy_dir($lintian_manual, "$output_dir/manual"); copy_dir($lintian_api, "$output_dir/library-api"); if ($lintian_log_file) { my %opts = ( 'in' => $lintian_log_file, 'out' => "$output_dir/lintian.log.gz", ); run_cmd(\%opts, 'gzip', '-9nc'); $RESOURCE_MANAGER->install_resource("$output_dir/lintian.log.gz"); symlink($RESOURCE_MANAGER->resource_URL('lintian.log.gz'), "$output_dir/lintian.log.gz"); } for my $dir (@{$resource_dirs}) { next if not -d $dir; opendir(my $dirfd, $dir); for my $resname (readdir($dirfd)) { next if $resname eq '.' or $resname eq '..'; $RESOURCE_MANAGER->install_resource("$dir/$resname", { install_method => 'copy' }); } closedir($dirfd); } return; } sub collect_statistics { my ($profile, $state_cache, $statistics_file, $maintainers_ref, $sources_ref, $qa_list_ref) = @_; my $old_statistics; # For each of this maintainer's packages, add statistical information # about the number of each type of tag to the QA data and build the # packages hash used for the package index. We only do this for the # maintainer packages, not the uploader packages, to avoid # double-counting. for my $maintainer (@{$maintainers_ref}) { for my $source (keys %{ $by_maint{$maintainer} }) { my %count; for my $version ( sort versions_comparator keys %{ $by_maint{$maintainer}{$source} }){ my $tags = $by_maint{$maintainer}{$source}{$version}; for my $tag (@{$tags}) { $count{$tag->{code}}++; } if (@$tags) { $sources_ref->{$source}{$version} = $tags->[0]{pkg_info}{xref}; } } $qa_list_ref->{$source} = \%count; } } for my $tag ($profile->known_tags) { my ($count, $overrides) = (0, 0); my %seen_tags; next if (not exists($by_tag{$tag})); foreach (@{$by_tag{$tag}}) { if ($_->{code} ne 'O') { $count++; $seen_tags{$_->{pkg_info}{xref}}++; } else { $overrides++; } } $tag_statistics{$tag}{'count'} = $count; $tag_statistics{$tag}{'overrides'} = $overrides; $tag_statistics{$tag}{'packages'} = scalar(keys(%seen_tags)); } # Read in the old statistics file so that we can calculate deltas for # all of our statistics. if (-f $statistics_file) { ($old_statistics) = read_dpkg_control_lc($statistics_file); } $statistics{'groups-known'} = scalar(keys(%{$state_cache->{'groups'}})); $statistics{'groups-backlog'} = scalar(find_backlog($LINTIAN_VERSION,$state_cache)); my $pkgs_w_errors = 0; for my $group_data (values(%{$state_cache->{'groups'}})) { $pkgs_w_errors++ if exists($group_data->{'processing-errors'}) and $group_data->{'processing-errors'}; } $statistics{'groups-with-errors'} = $pkgs_w_errors; for my $attr (@attrs) { my $old = $old_statistics->{$attr} || 0; $statistics{$attr} ||= 0; $delta{$attr} = sprintf('%d (%+d)', $statistics{$attr},$statistics{$attr} - $old); } return $old_statistics; } sub extract_name_and_email { my ($maintainer) = @_; my ($name, $email) = ($maintainer =~ /^(.*) <([^>]+)>/); $name = 'Unknown Maintainer' unless $name; $email = 'unknown' unless $email; return ($name, $email); } # Generate the package lists. These are huge, so we break them into four # separate pages. # # FIXME: Does anyone actually use these pages? They're basically unreadable. sub generate_package_index_packages { my ($sources_ref) = @_; my %list = ( '0-9, A-F' => [], 'G-L' => [], 'M-R' => [], 'S-Z' => [], ); for my $package (sort(keys(%{$sources_ref}))) { my $first = uc(substr($package, 0, 1)); if ($first le 'F') { push(@{ $list{'0-9, A-F'} }, $package) } elsif ($first le 'L') { push(@{ $list{'G-L'} }, $package) } elsif ($first le 'R') { push(@{ $list{'M-R'} }, $package) } else { push(@{ $list{'S-Z'} }, $package) } } my %data = (sources => $sources_ref); my $i = 1; for my $section (sort(keys(%list))) { $data{section} = $section; $data{list} = $list{$section}; output_template("packages_$i.html", $templates{packages}, \%data); $i++; } return; } sub run_scour { my ($input_file, $output_file) = @_; run_cmd('scour', '-i',$input_file, '-o',$output_file, '-q', '--enable-id-stripping', '--enable-comment-stripping', '--shorten-ids', '--indent=none'); return 1; } sub update_history_and_make_graphs { my ($attrs_ref, $statistics_ref, $tag_statistics_ref) = @_; # Update history. my %versions; my $graph_dir = "$HTML_TMP_DIR/graphs"; my $commonf = "$graph_dir/common.gpi"; my $unix_time = time(); mkdir("$HISTORY_DIR") if (not -d "$HISTORY_DIR"); mkdir("$HISTORY_DIR/tags") if (not -d "$HISTORY_DIR/tags"); my $history_file = "$HISTORY_DIR/statistics.dat"; my $stats = ''; for my $attr (@{$attrs_ref}) { $stats .= ' ' . $statistics_ref->{$attr}; } open(my $hist_fd, '+>>', $history_file); print {$hist_fd} "$unix_time $LINTIAN_VERSION$stats\n"; if ($GRAPHS) { seek($hist_fd, 0, SEEK_SET); while (<$hist_fd>) { my @fields = split(); $versions{$fields[1]} = $fields[0] if not exists $versions{$fields[1]}; } } close($hist_fd); if ($GRAPHS) { mkdir("$graph_dir", 0777); mkdir("$graph_dir/tags", 0777); my $date_min = strftime('%s', localtime($unix_time - 3600 * 24 * $GRAPHS_RANGE_DAYS)); my $date_max = strftime('%s', localtime($unix_time)); # Generate loadable Gnuplot file with common variables and labels/arrows # for Lintian versions. open(my $common, '>', $commonf); print {$common} "history_dir='$HISTORY_DIR'\n"; print {$common} "graph_dir='$graph_dir'\n"; print {$common} "date_min='$date_min'\n"; print {$common} "date_max='$date_max'\n"; my $last_version = 0; for my $v (sort { $versions{$a} <=> $versions{$b} } keys %versions) { next unless $versions{$v} > $date_min; print {$common} "set arrow from '$versions{$v}',graph 0 to ", "'$versions{$v}',graph 1 nohead lw 0.4\n"; # Skip label if previous release is too close; graphs can't display # more than ~32 labels. my $min_spacing = 3600 * 24 * $GRAPHS_RANGE_DAYS / 32; if ($versions{$v} - $last_version > $min_spacing) { (my $label = $v) =~ s/\-[\w\d]+$//; print {$common} "set label '$label' at '$versions{$v}',graph ", "1.04 rotate by 90 font ',8'\n"; $last_version = $versions{$v}; } } close($common); print "Plotting global statistics...\n"; run_cmd({ 'chdir' => $graph_dir}, 'gnuplot',"$LINTIAN_BASE/reporting/graphs/statistics.gpi"); if ($SCOUR_ENABLED) { # Do a little "rename" dance to ensure that we keep the # "statistics.svg"-basename without having to use a # subdirectory. rename( "${graph_dir}/statistics.svg", "${graph_dir}/_statistics-orig.svg" ); run_scour( "${graph_dir}/_statistics-orig.svg", "${graph_dir}/statistics.svg" ); } $RESOURCE_MANAGER->install_resource("${graph_dir}/statistics.svg"); } my $gnuplot_fd; if ($GRAPHS) { open($gnuplot_fd, '>', "$graph_dir/call.gpi"); } for my $tag (sort(keys(%{$tag_statistics_ref}))) { $history_file = "$HISTORY_DIR/tags/$tag.dat"; $stats = $tag_statistics_ref->{$tag}; open(my $tag_fd, '>>', $history_file); print {$tag_fd} "$unix_time $stats->{'count'} $stats->{'overrides'} " ."$stats->{'packages'}\n"; close($tag_fd); if ($GRAPHS) { print {$gnuplot_fd} qq{print 'Plotting $tag statistics...'\n}; print {$gnuplot_fd} qq{call '$LINTIAN_BASE/reporting/graphs/tags.gpi' '$tag'\n}; print {$gnuplot_fd} qq{reset\n}; } } if ($GRAPHS) { my $svg_dir = "${graph_dir}/tags"; close($gnuplot_fd); run_cmd({'chdir' => $graph_dir}, 'gnuplot', 'call.gpi'); unlink($commonf); if ($SCOUR_ENABLED) { # Obvious optimization potential; run scour in parallel my $optimized_dir = "${graph_dir}/tags-optimized"; mkdir($optimized_dir); print "Minimizing tag graphs; this may take a while ...\n"; for my $tag (sort(keys(%{$tag_statistics_ref}))) { run_scour("${svg_dir}/${tag}.svg", "${optimized_dir}/${tag}.svg"); } $svg_dir = $optimized_dir; } for my $tag (sort(keys(%{$tag_statistics_ref}))) { my $graph_file = "${svg_dir}/${tag}.svg"; $RESOURCE_MANAGER->install_resource($graph_file); } path($graph_dir)->remove_tree if -d $graph_dir; } return; } # Write out the QA package list. This is a space-delimited file that contains # the package name and then the error count, warning count, info count, # pedantic count, experimental count, and overridden tag count. sub write_qa_list { my ($qa_data) = @_; open(my $qa_fd, '>', "$HTML_TMP_DIR/qa-list.txt"); for my $source (sort(keys(%{$qa_data}))) { print {$qa_fd} $source; for my $code (qw/E W I P X O/) { my $count = $qa_data->{$source}{$code} || 0; print {$qa_fd} " $count"; } print {$qa_fd} "\n"; } close($qa_fd); return; } # Generate a "redirect" lookup table for the webserver to power the # "/source/[/]" redirects. sub generate_lookup_tables { my ($sources_ref) = @_; mkdir("$HTML_TMP_DIR/lookup-tables"); open(my $table, '>', "$HTML_TMP_DIR/lookup-tables/source-packages"); foreach my $source (sort(keys(%{$sources_ref}))) { my $first = 1; for my $version ( sort versions_comparator keys %{ $sources_ref->{$source} }) { my $xref = $sources_ref->{$source}{$version}; print {$table} "$source full/$xref\n" if $first; print {$table} "$source/$version full/$xref\n"; $first = 0; } } close($table); return; } # Determine the file name for the maintainer page given a maintainer. It # should be .html where is their email address with all # characters other than a-z A-Z 0-9 - _ . @ = + replaced with _. Don't change # this without coordinating with QA. sub maintainer_url { my ($maintainer) = @_; if ($maintainer =~ m/<([^>]+)>/) { my $id = $1; $id =~ tr/a-zA-Z0-9_.@=+-/_/c; return "$id.html"; } else { return 'unsorted.html'; } } sub parse_lintian_log { my ($profile, $state_cache) = @_; # We take a lintian log file on either standard input or as the # first argument. This log file contains all the tags lintian # found, plus N: tags with informational messages. Ignore all the # N: tags and load everything else into the hashes we use for all # web page generation. # # We keep track of a hash from maintainer page URLs to maintainer # values so that we don't have two maintainers who map to the same # page and overwrite each other's pages. If we find two # maintainers who map to the same URL, just assume that the second # maintainer is the same as the first (but warn about it). # # The "last_*" are optimizations to avoid computing the same # things over and over again when a package have multiple tags. my (%seen, $last_info, $last_maintainer, %unknown_member_id, $info, $last_pi, %map_maint, %arch_map); my %expanded_code = ( E => 'errors', W => 'warnings', I => 'info', X => 'experimental', O => 'overridden', P => 'pedantic', C => 'classifications', ); while (<>) { my @parts; chomp; @parts = split_tag($_); next unless @parts; my ($code, $package, $type, $version, $arch, $tag, $extra) = @parts; $type = 'binary' unless (defined $type); next unless ($type eq 'source' || $type eq 'binary' || $type eq 'udeb'); # Ignore unknown tags - happens if we removed a tag that is # still present in the log file. my $tag_info = $profile->get_taginfo($tag); next if not $tag_info or $tag_info->effective_severity eq 'classification'; # Update statistics. my $key = $expanded_code{$code}; $statistics{$key}++; unless ($seen{"$package $type"}) { $statistics{"$type-packages"}++; $seen{"$package $type"} = 1; } # Determine the source package for this package and warn if # there appears to be no source package in the archive. # Determine the maintainer, version, and archive component. Work # around a missing source package by pulling information from # a binary package or udeb of the same name if there is any. my ($source, $component, $source_version, $maintainer, $uploaders); my $member_id = "${type}:${package}/${version}" . ($type ne 'source' ? "/$arch" : q{}); my $state_data = $state_cache->{'members-to-groups'}{$member_id}; next if exists($unknown_member_id{$member_id}); if ($type eq 'source') { $source = $package; $source_version = $version; if (not defined($state_data)) { warn "Source package ${member_id} not found in state-cache!\n"; $unknown_member_id{$member_id} = 1; } } elsif (defined($state_data)) { my $src_member = first { s/^source:// } keys(%{$state_data->{'members'}}); if ($src_member) { ($source, $source_version) = split(m{/}, $src_member, 2); } } elsif (not defined($state_data)) { warn "Package ${member_id} not found in state-cache!\n"; $unknown_member_id{$member_id} = 1; } $state_data //= {}; $component = $state_data->{'mirror-metadata'}{'component'} ||= 'main'; $maintainer = $state_data->{'mirror-metadata'}{'maintainer'} ||= '(unknown)'; $uploaders = $state_data->{'mirror-metadata'}{'uploaders'}; $source ||= ''; $version = 'unknown' unless (defined($version) and length($version) > 0); $source_version = $version unless (defined($source_version) and length($source_version) > 0); # Sanitize, just out of paranoia. $package =~ tr/a-zA-Z0-9.+-/_/c; $source =~ tr/a-zA-Z0-9.+-/_/c; $version =~ tr/a-zA-Z0-9.+:~-/_/c; $source_version =~ tr/a-zA-Z0-9.+:~-/_/c; # Conditionally call html_quote if needed. On average, 11-13% of # all tags (emitted on lintian.d.o) have no "extra". That would be # tags like "no-upstream-changelog" (now removed) if (defined($extra)) { $extra = html_quote($extra); } else { $extra = ''; } # Store binary architectures my $arch_key = join(':', $package, $type, $version, $tag, $extra); $arch_map{$arch_key}{$arch} = 1 unless $arch eq 'all' or $arch eq 'source'; # Don't duplicate entries if they only differ on architecture next if scalar(keys %{$arch_map{$arch_key}}) > 1; # Add the tag information to our hashes. Share the data # between the hashes to save space (which means we can't later # do destructive tricks with it). if ( $last_info && $last_pi->{type} eq $type && $last_pi->{package} eq $package && $last_pi->{version} eq $version) { # There are something like 622k tags emitted on lintian.d.o, # but only "some" 90k unique package+version(+arch) pairs. # Therefore, we can conclude that the average package will # have ~6 tags and optimise for that case. Indeed, this path # seems to be taken about 90% of the time (561k/624k). # - In fact, we see less than "90k" package+version(+arch) # pairs here, since entries without tags never this far down # in this loop (i.e. they are filtered out by split_tag # above). # Copy the last info and then change the bits that can change $info = {%{$last_info}}; # Code depends on whether the given tag was overridden or not $info->{code} = $code; $info->{extra} = $extra; if ($info->{tag_info}->name ne $tag) { $info->{tag_info} = $tag_info; } # saves a map_maintainer call $maintainer = $last_maintainer; } else { my $anchor = "${source}_${source_version}"; # Apparently "+" are not allowed in ids and I am guessing # ":" is not either if (index($anchor, '+') > -1 or index($anchor, ':') > -1) { $anchor =~ s/[+]/_x2b/g; $anchor =~ s/[:]/_x3a/g; } if (substr($maintainer, 0, 1) eq q{"}) { # Strip out ""-quotes, which is required in d/control for some # maintainers. $maintainer =~ s/^"(.*)" <(.*)>$/$1 <$2>/; } # Check if we've seen the URL for this maintainer before # and, if so, map them to the same person as the previous # one. $last_maintainer = $maintainer = map_maintainer(\%map_maint, $maintainer); # Update maintainer statistics. $statistics{maintainers}++ unless defined $by_maint{$maintainer}; $last_info = $info = { # Tag instance specific data # split_tags ensures that $code is a single upper case letter code => $code, tag_info => $tag_info, # extra is unsafe in general, but we already quote it above. extra => $extra, archs => $arch_map{$arch_key}, # Shareable data pkg_info => { package => $package, version => $version, # There is a check for type being in a fixed whitelist of # HTML-safe keywords in the start of the loop., type => $type, component => html_quote($component), # should be safe anchor => $anchor, xref => maintainer_url($maintainer). "#${anchor}", 'state_data' => $state_data, maintainer => html_quote($maintainer), }, }; $last_pi = $info->{pkg_info}; if (!$by_maint{$maintainer}{$source}{$source_version}) { my $list_ref = []; $by_maint{$maintainer}{$source}{$source_version} = $list_ref; # If the package had uploaders listed, also add the # information to %by_uploaders (still sharing the data # between hashes). if ($uploaders) { for my $uploader (@{$uploaders}) { if (substr($uploader, 0, 1) eq q{"}) { # Strip out ""-quotes, which is required in # d/control for some uploaders. $uploader =~ s/^"(.*)" <(.*)>$/$1 <$2>/; } $uploader = map_maintainer(\%map_maint, $uploader); next if $uploader eq $maintainer; $by_uploader{$uploader}{$source}{$source_version} = $list_ref; } } } } push(@{ $by_maint{$maintainer}{$source}{$source_version} }, $info); $by_tag{$tag} ||= []; push(@{ $by_tag{$tag} }, $info); } return; } # Deduplicate maintainers. Maintains a cache of the maintainers we've seen # with a given e-mail address and returns the maintainer string that we # should use (which is whatever maintainer we saw first with that e-mail). sub map_maintainer { my ($urlmap, $maintainer) = @_; my $url = maintainer_url($maintainer); if (defined(my $res = $urlmap->{$url})) { $maintainer = $res; } else { $urlmap->{$url} = $maintainer; } return $maintainer; } # Quote special characters for HTML output. sub html_quote { my ($text) = @_; $text ||= ''; # Use index to do a quick check before we bother requesting a # subst. On average, this is cheaper than blindly s///'ing, since # we rarely subst (all) of the characters below. if (index($text, '&') > -1) { $text =~ s/&/\&/g; } if (index($text, '<') > -1) { $text =~ s/') > -1) { $text =~ s/>/\>/g; } if (index($text, '/') > -1) { $text =~ s/\//\//g; } return $text; } # Given a file name, a template, and a data hash, fill out the template with # that data hash and output the results to the file. sub output_template { my ($file, $template, $data) = @_; my $path_prefix = $data->{path_prefix}; if (not defined($path_prefix)) { $path_prefix = ''; if (index($file, '/') > -1) { $path_prefix = '../' x ($file =~ tr|/||); } } $data->{version} ||= $LINTIAN_VERSION; $data->{timestamp} ||= $timestamp; $data->{by_version} ||= \&versions_comparator; $data->{path_prefix} ||= $path_prefix; $data->{html_quote} ||= \&html_quote; $data->{resource_path} ||= sub { return $path_prefix . $RESOURCE_MANAGER->resource_URL($_[0]); }; $data->{resource_integrity} ||= sub { return $RESOURCE_MANAGER->resource_integrity_value($_[0]); }; $data->{head} ||= sub { $templates{head}->fill_in( HASH => { page_title => $_[0], config_vars => $TEMPLATE_CONFIG_VARS, %{$data}, }) or die "Filling out head of $file: $Text::Template::ERROR\n"; }; $data->{foot} ||= sub { $templates{foot}->fill_in( HASH => { config_vars => $TEMPLATE_CONFIG_VARS, %{$data}, }) or die "Filling out footer of $file: $Text::Template::ERROR\n"; }; $data->{config_vars} ||= $TEMPLATE_CONFIG_VARS; open(my $fd, '>:encoding(UTF-8)', "$HTML_TMP_DIR/$file"); $template->fill_in(OUTPUT => $fd, HASH => $data) or die "filling out $file failed: $Text::Template::ERROR\n"; close($fd); return; } # Sort function for sorting lists of tags. Sort by package, version, component, # type, tag, and then any extra data. This will produce the best HTML output. # # Note that source tags must come before all other tags, hence the "unfair" # priority for those. This is because the first tags listed are assumed to # be source package tags. sub by_tag { my $a_pi = $a->{pkg_info}; my $b_pi = $b->{pkg_info}; if ($a_pi->{type} ne $b_pi->{type}) { return -1 if $a_pi->{type} eq 'source'; return 1 if $b_pi->{type} eq 'source'; } return $a_pi->{package} cmp $b_pi->{package} || $a_pi->{version} cmp $b_pi->{version} || $a_pi->{component} cmp $b_pi->{component} || $a_pi->{type} cmp $b_pi->{type} || $a->{tag_info}->name cmp $b->{tag_info}->name || $a->{extra} cmp $b->{extra}; } =item split_tag =cut { # Matches something like: (1:2.0-3) [arch1 arch2] # - captures the version and the architectures my $verarchre = qr,(?: \s* \(( [^)]++ )\) \s* \[ ( [^]]++ ) \]),xo; # ^^^^^^^^ ^^^^^^^^^^^^ # ( version ) [architecture ] # matches the full deal: # 1 222 3333 4444444 5555 666 777 # - T: pkg type (version) [arch]: tag [...] # ^^^^^^^^^^^^^^^^^^^^^ # Where the marked part(s) are optional values. The numbers above # the example are the capture groups. my $TAG_REGEX = qr/([EWIXOPC]): (\S+)(?: (\S+)(?:$verarchre)?)?: (\S+)(?:\s+(.*))?/; sub split_tag { my ($tag_input) = @_; my $pkg_type; return unless $tag_input =~ /^${TAG_REGEX}$/; # default value... $pkg_type = $3//'binary'; return ($1, $2, $pkg_type, $4, $5, $6, $7); } } 1; # Local Variables: # indent-tabs-mode: nil # cperl-indent-level: 4 # End: # vim: syntax=perl sw=4 sts=4 sr et