#!/bin/sh
# SPDX-FileCopyrightText: 2020-2026  Jonas Smedegaard <dr@jones.dk>
#
# SPDX-FileCopyrightText: 2020-2021  Purism, SPC
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
# Description: helper script to update copyright_hints
#
# Depends:
#  licensecheck,
#  libimage-exiftool-perl,
#  libipc-system-simple-perl,
#  libpath-tiny-perl,
#  libregexp-assemble-perl,
#  perl,

# abort on any failing command and on expansion of an unset variable
set -eu

# space-separated file extensions of binary files
# without parsable metadata, to be skipped
EXT_skip='alaw mulaw mjr'
# regex of additional paths to skip, relative to the working directory
RE_skip='fuzzers/corpora/.*'
# space-separated file extensions of binary files
# to extract metadata from using exiftool
EXT_meta='jpeg jpg png mp4 opus webp'

ext2re() { perl -sE 'printf ".*\.(%s)", join "|", split " ", $e' -- -e="$1"; }
ext2opt() { perl -sE 'say map {" $o $_"} split " ", $e' -- -o="$1" -e="$2"; }

# extension-derived regexes, followed by any already declared path regexes
RE_skip="$(ext2re "$EXT_skip")${RE_skip:+|$RE_skip}"
RE_meta="$(ext2re "$EXT_meta")${RE_meta:+|$RE_meta}"
# Debian packaging files excluded when checking below debian/
RE_debian='debian/(changelog|copyright(_hints)?|source/lintian-overrides)'
# suffixes (after a colon) of the temporary hint files
RE_hint='skip|meta'

# remove hint files possibly left behind by an earlier, aborted run
find ./* -type f -regextype posix-egrep -regex "^\./.*:($RE_hint)$" -delete

# place a stub hint file next to each binary file to skip
echo 'skip binary files without parsable metadata ...' >&2
find ./* -type f -regextype posix-egrep -regex "^\./($RE_skip)$" \
 -exec sh -c 'echo "License: UNKNOWN" > "$1:skip"' shell {} \;

# dump embedded metadata as text into a hint file next to each binary file
# (the command substitution is deliberately unquoted, to split into words)
echo 'extract metadata from binary files ...' >&2
exiftool '-textOut!' %d%f.%e:meta -short -short -recurse -extractEmbedded \
 $(ext2opt -ext "$EXT_meta") \
 -- ./*

# resolve file regex from contained license or shebang, or path regex
#
# Usage: _file_regex [OPTION]... [--] PATH...
#
# Each PATH is a file, or a directory which is traversed recursively.
# Prints on stdout (without trailing newline) a single regex matching
# the paths of all files selected by the options below.
# License texts are read from debian/copyright in the working directory.
#
# Options (each may be given multiple times):
#  --shortname NAME  select files containing the text
#                    of a "License: NAME" section
#  --grantglob GLOB  select files containing the license grant or text
#                    of a Files section listing GLOB
#  --shebang STRING  select files whose first line begins with STRING
#  --nonverb REGEX   wording tolerated between the words of a license,
#                    in addition to non-word characters
#  --regex REGEX     path regex to include as-is in the result
_file_regex() {
    perl -Mv5.36 -MGetopt::Long=:config,gnu_getopt -MPath::Tiny -MRegexp::Assemble -MList::Util=any -- - "$@" <<'EOF'
my (%opt, @license, @files, @match);
GetOptions ( \%opt, "shortname=s@", "grantglob=s@", "regex=s@", "shebang=s@", "nonverb=s@" );

# paragraphs of the copyright file, separated by empty lines
my @section = split /\n\n+/, path("debian/copyright")->slurp_utf8;

# continuation lines of each License section with a requested shortname
for my $name ( @{ $opt{shortname} // [] } ) {
    push @license, map {/^License:\h*\Q$name\E\n\h+(\S[^\n]*(?:\n\h+\S[^\n]*)*)/} @section; }

# sections whose Files field lists a requested glob
for my $glob ( @{ $opt{grantglob} // [] } ) {
    push @files, grep {/^Files:\n?\h(?:\S[^\n]*\n?\h)*\Q$glob\E\s/} @section; }

# continuation lines of License-Grant or License fields in those sections
my @grant = map {/^License(?:-Grant:|:\h*\S[^\n]*)\h*\n\h+(\S[^\n]*(?:\n\h+\S[^\n]*)*)/mg} @files;

my @firstline_re = map {qr/^\Q$_\E/} @{ $opt{shebang} // [] };

# what may appear between two consecutive words of a license text
my $nonverb_re = Regexp::Assemble->new
    ->add("\\W+", map {"\\W+(?:$_)\\W+"} @{ $opt{nonverb} // [] })
    ->as_string;

# turn each license text into a regex of its words,
# tolerant to differences in whatever separates the words
my @content_re = map {
    s/\W+/[**]/g;
    # a lone digit may be replaced by up to two arbitrary characters
    s/\Q[**]\E\d\Q[**]\E/[**]\\S{0,2}[**]/g;
    s/\Q[**]\E/$nonverb_re/g;
    qr/$_/
} @license, @grant;

# collect the escaped path of a file when its first line or content matches
my $inspect = sub {
    my ($path) = @_;
    return if $path->is_dir;
    if (@firstline_re) {
        # an empty file has no lines: treat its first line as empty
        my ($head) = $path->lines({ count => 1 });
        $head //= "";
        push @match, quotemeta($path) and return
            if any { $head =~ $_ } @firstline_re;
    }
    return unless @content_re;
    # read the file only once, not once per candidate pattern
    my $content = $path->slurp_raw;
    push @match, quotemeta($path)
        if any { $content =~ $_ } @content_re;
    return;
};
for (@ARGV) {
    my $p = path($_);
    $p->is_dir ? $p->visit($inspect, { recurse => 1 }) : $inspect->($p);
}

# merge matched paths and explicit regexes into a single regex,
# with non-capturing groups rewritten as plain groups
my $files_re = Regexp::Assemble->new->add(@match, @{ $opt{regex} // [] });
print $files_re->as_string =~ s/\(\?:/\(/gr;
EOF
}

# all paths represented by hint files, for licensecheck to ignore
RE_SKIP="${RE_skip}|${RE_meta}"

# licensing patterns misdetected by licensecheck
RE_janus="$(_file_regex --grantglob '*' -- *)"

# TODO: automate more of this manual cleanup:
#  * strip garbage copyright holders
#  * optionally merge equally licensed Files sections
#  * do "sort -k2 -k1,1 -u" on copyright holders
#  * merge copyright years for each copyright holder
# TODO: strip files matching glob in current (only, no later) section
# Run licensecheck recursively on all entries of the working directory,
# post-process its output, and append the result to debian/copyright_hints.
#
# Usage: _licensecheck [OPTION]...
#
# Options:
#  --check REGEX     passed to licensecheck as --check
#  --ignore REGEX    passed to licensecheck as --ignore
#  --merge-licenses  passed on to licensecheck
#  --shortname NAME  add NAME to the value of each License field,
#                    dropping UNKNOWN and joining names with " and/or "
#  --subset GLOBS    space-separated globs for the first Files section:
#                    when any of them contains "*" (and NOGLOBMERGE is unset)
#                    they replace its file list, else they are prepended;
#                    the leading paragraph of the output is dropped
#                    unless the first glob is "*"
#  --hint REGEX      hint file suffixes to strip (together with the
#                    preceding colon) from line ends;
#                    always passed, as $RE_hint
#
# Environment:
#  DEBUG        when true, print the licensecheck command on stderr
#  NOGLOBMERGE  when true, never replace file lists with subset globs
#
# Additionally, the values of Files and Copyright fields are moved onto
# continuation lines, and the space after a comma separating years
# is removed.  A progress message is printed on stderr.
_licensecheck() {
    perl -Mv5.36 -MGetopt::Long=:config,gnu_getopt \
    -MIPC::System::Simple=capture -MList::Util=uniq \
    -- - --hint="$RE_hint" "$@" <<'EOF' >> debian/copyright_hints
my %opt;
GetOptions ( \%opt, "merge-licenses",
    "hint=s", "check=s", "ignore=s", "shortname=s", "subset=s" );
my @subset = exists $opt{subset} ? split(" ", $opt{subset}) : ();
my $subset_globs = join( "\n ", @subset );
if    ( $subset_globs =~ /^[*]$/ )    { say STDERR "check default section(s) ..." }
elsif ( @subset and $opt{shortname} ) { say STDERR "check $opt{shortname} section(s) @subset ..." }
elsif (@subset)                       { say STDERR "check section(s) @subset ..." }
elsif ( $opt{shortname} )             { say STDERR "check $opt{shortname} section(s) ..." }
else                                  { say STDERR "check remaining upstream section(s) ..." }
my @cmd = ( qw(licensecheck --copyright --deb-machine --recursive --lines 0),
    "--check",  $opt{check}, "--ignore", $opt{ignore},
    ($opt{"merge-licenses"} ? "--merge-licenses" : ()), "--" );
say STDERR "@cmd *" if $ENV{DEBUG};
local $_ = capture( @cmd, glob "*" );
if ( !$ENV{NOGLOBMERGE} and grep /[*]/, @subset ) { s/^.*?\n\nFiles: \K.*?(?=\n\w)/$subset_globs/s }
elsif (@subset)                                   { s/^.*?\n\nFiles: \K/$subset_globs\n /s }
s/^.*?\n\n//s if exists $opt{subset} and (!@subset or $subset[0] ne "*");
s/^Files:\K /\n /mg;
s/^[C]opyright:\K /\n  /mg;
s/(?:(?<=^  )|(?<=\d{4})),\K (?=\d{4})//mg;
s/:(?:$opt{hint})$//mg if $opt{hint};
if ($opt{shortname}) { s/^License: \K(.*)/ join " and\/or ", uniq sort grep( !m{\AUNKNOWN\Z}, split(" and\/or ", $1), $opt{shortname} ) /mge }
print;
EOF
}

# start from scratch, since each check below appends to the hints file
rm -f debian/copyright_hints

# initially, check all to know roughly what to group and in which order
#rm -f debian/copyright_hints
#_licensecheck '' --check '.*' --ignore "^($RE_SKIP|debian/.*)$"
#exit 0

# check default licensed files first
_licensecheck \
 --shortname 'GPL-3 with OpenSSL exception' \
 --subset '*' \
 --check "^($RE_janus)$" \
 --ignore "^($RE_SKIP|debian/.*)$"

# check generally
#  * omit non-copyright-protected Debian files
_licensecheck \
 --subset '' \
 --check '.*' \
 --ignore "^($RE_janus|$RE_SKIP|debian/.*)$"
_licensecheck \
 --subset 'debian/*' \
 --check '^debian/' \
 --ignore "^($RE_janus|$RE_SKIP|$RE_debian)$"

# cleanup hint files
find ./* -type f -regextype posix-egrep -regex "^\./.*:($RE_hint)$" -delete
