#!/usr/bin/perl

##########################################################################
# dvd-duplicate-detector.pl
#
# Saves space and time when ripping DVDs with duplicated VOB files.
#
# Usage:
#
# 1) Prepopulate directory:
# isoinfo -l dev=1,0,0 | \
#              dvd-duplicate-detector.pl --deduplicate-directory BLAFASEL
# 2) Resume rip with an application that supports it
# 3) Profit
#
# For details see:
# https://www.fabiankeil.de/gehacktes/dvd-duplicate-detector/
#
# Copyright (c) 2011-2014 Fabian Keil <fk@fabiankeil.de>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
##########################################################################

use strict;
use warnings;
use Getopt::Long;

# When true, parse_fail() and parse_win() report every line they are
# handed; when false (the default) both stay silent.
my $complaints_are_us = 0;
# When true, deduplicate_or_prepare_directory() prints which files share
# each start location while it groups them.
my $duplicate_detection_debugging = 0;

# Reports a line of isoinfo output that could not be parsed.
#
# Only prints something if $complaints_are_us is enabled, otherwise
# the line is silently dropped.
sub parse_fail($) {
    my ($line) = @_;

    if ($complaints_are_us) {
        return print "Can't parse: '$line'\n";
    }

    return;
}

# Reports a line of isoinfo output that was parsed successfully.
#
# Only prints something if $complaints_are_us is enabled, otherwise
# the line is silently dropped.
sub parse_win($) {
    my ($line) = @_;

    if ($complaints_are_us) {
        return print "Pleased to parse:    '$line'\n";
    }

    return;
}

# Groups the expected files by the location they start at, reports which
# of them are master files and which are duplicates and, if a directory
# was given, replaces the duplicates in it with hard links to their master.
#
# Parameters:
#   $dvd_directory  - Directory to deduplicate or prepopulate. It is created
#                     if it doesn't exist yet. If it is undefined the results
#                     are only printed and nothing is written to disk.
#   $expected_files - Reference to a hash that maps file names to hashes
#                     with (at least) a "location" key.
#
# Dies if the directory can't be created or entered, if the location of
# a file is unknown, or if an existing duplicate can't be removed or a
# placeholder can't be created. Link failures are only reported.
#
# XXX: This is a bit simplified and will only detect duplicates
#      that start at the same sector, it will not detect duplicates
#      with overlapping ranges but different start sectors.
#      I haven't seen those yet, but wouldn't be surprised if they'd exist.
sub deduplicate_or_prepare_directory($$) {
    my $dvd_directory = shift;
    my $expected_files = shift;
    my %locations;
    my %originals;
    my %duplicates;

    if (defined $dvd_directory) {
        unless (-d $dvd_directory) {
            print "Creating $dvd_directory\n";
            mkdir($dvd_directory) or die "Failed to create $dvd_directory: $!\n";
        }
        # All file names used below are relative to this directory.
        chdir($dvd_directory) or die "Failed to cd into $dvd_directory: $!\n";
    }

    # Group the file names by the location they start at.
    foreach my $file (keys %{$expected_files}) {
        my $location = $expected_files->{$file}{location};
        die "Location for $file unknown" unless defined $location;
        $locations{$location}{$file}++;
    }

    # Within each group the first file in sort order is treated as the
    # original, all the other ones are considered duplicates of it.
    foreach my $location (keys %locations) {
        my $original = undef;

        print "Files starting at '$location':\n" if $duplicate_detection_debugging;

        foreach my $file (sort keys %{$locations{$location}}) {
            unless (defined $original) {
                print "$file (original)\n" if $duplicate_detection_debugging;
                $original = $file;
                $originals{$original}++;
            } else {
                print "$file (copy of $original)\n" if $duplicate_detection_debugging;
                $duplicates{$file} = $original;
            }
        }
        print "\n" if $duplicate_detection_debugging;
    }

    print "Master files:\n";
    foreach my $file (sort keys %originals) {
        print "$file\n";
    }

    print "Duplicates that can be replaced with links:\n";
    foreach my $file (sort keys %duplicates) {
        my $original = $duplicates{$file};

        # Sanity checks for the bookkeeping above.
        die "$original isn't a original file but should be" unless defined $originals{$original};
        die "$file seems to be an orignal but is counted as duplicate" if defined $originals{$file};
        print "$file -> $original\n";

        # Without a directory we only report what could be done.
        next unless defined $dvd_directory;

        print "Checking existence of $file\n";
        if (-f $file) {
            print "Ditching $file\n";
            unlink($file) or die "Failed to unlink $file: $!";
        }
        unless (-f $original) {
            print "The original $original doesn't exist yet. Creating a placeholder.\n";
            # The placeholder has to end up in the directory the original
            # is expected in, which isn't necessarily VIDEO_TS.
            _ensure_parent_directory($original);
            open(my $fd, '>', $original) or die "Failed to create $original: $!\n";
            close($fd) or die "Failed to close $original: $!\n";
        }
        print "Linking $file to $original\n";
        # Link failures are not considered worth dieing even though they
        # shouldn't happen in normal circumstances.
        link($original, $file) or print "Failed to link $file to $original: $!\n";
    }
}

# Creates the directory that is supposed to contain $path (including
# missing intermediate directories) unless it already exists.
# Dies if the directory can't be created.
sub _ensure_parent_directory {
    my ($path) = @_;

    require File::Basename;
    require File::Path;

    my $directory = File::Basename::dirname($path);
    return if -d $directory;

    # Collect errors instead of letting make_path() croak so the
    # message has the same format as the other ones in this script.
    File::Path::make_path($directory, { error => \my $errors });
    if (@{$errors}) {
        my ($message) = values %{$errors->[0]};
        die "Failed to create $directory: $message\n";
    }

    return;
}

# Parses the output of "isoinfo -l" read from <>.
#
# Returns a reference to a hash that maps each file name (prefixed with
# the directory it was listed in, relative to the root of the image) to
# a hash with the keys:
#   length   - the size field of the listing line
#   location - the first number inside the brackets of the listing line
#
# Lines that are empty or describe directories are skipped, lines that
# aren't understood are passed to parse_fail().
#
# Dies if the same file name is listed more than once.
sub parse_isoinfo() {
    my $directory = '';
    my %expected_files;

    while (my $line = <>) {
        chomp $line;

        next unless length($line);

        if ($line =~ /^Directory listing of \/(.*)/) {
            # Directory listing of /VIDEO_TS/
            #
            # The prefix is kept relative to the root of the image. For the
            # root directory itself it is empty; a "/" prefix would turn the
            # entries into absolute paths and the file operations done for
            # --deduplicate-directory would leave the DVD directory.
            $directory = $1;
            parse_win($line);

        } elsif ($line =~ /^[-rwx]{10}\s*\d+\s*\d+\s*\d+\s*(\d+).*\[\s*(\d+)\s*(\d+)\]\s*([\d\w._]*?)(;1)?\s*$/) {

            # -r-xr-xr-x   0    0    0      32768 Aug 22 2012 [  62200 00] VIDEO_TS.BUP;1 
            # ----------   0    0    0 1073739776 Aug  1 2005 [ 750709 00] VTS_13_2.VOB;1
            # ----------   0    0    0 1073739776 Aug  1 2005 [1274996 00] VTS_13_3.VOB;1
            my ($length, $location, $basename) = ($1, $2, $4);
            my $name = $directory . $basename;

            #print "$name is $length long and starts at $location\n";

            die "Duplicate file $name?" if (defined $expected_files{$name});

            $expected_files{$name}{length}   = $length;
            $expected_files{$name}{location} = $location;

            parse_win($line);

        } elsif ($line =~ /^d[-rwx]{9}\s/) {

            # d---------   0    0    0       2048 Aug  1 2005 [    259 02] ..

            # The mode field is ten characters wide: the 'd' followed by
            # nine permission characters.

            # We can ignore directory duplicates

        } else {

            parse_fail($line);
        }
    }

    return \%expected_files;
}

# Prints a one-line usage summary to the currently selected output
# handle (STDOUT unless it was changed).
#
# Returns the result of print (true on success) so that the call can be
# used in chains like "usage() and exit(1)".
sub usage() {
    return print "dvd-duplicate-detector.pl [--deduplicate-directory directory] < isoinfo_l_output\n";
}

# Entry point: parses the command line options, reads the isoinfo output
# and either reports the duplicates or, if --deduplicate-directory was
# given, additionally deduplicates or prepopulates that directory.
#
# Exits with status 0 after showing the usage for --help and with
# status 1 if the options can't be parsed.
sub main() {
    my $dvd_directory;

    my $options_ok = GetOptions(
        'deduplicate-directory=s' => \$dvd_directory,
        'help' => sub { usage(); exit(0) },
    );
    unless ($options_ok) {
        # Exit unconditionally instead of making the exit depend on the
        # return value of usage().
        usage();
        exit(1);
    }

    my $expected_files = parse_isoinfo();

    if (defined $expected_files) {
        deduplicate_or_prepare_directory($dvd_directory, $expected_files);
    }

    return;
}

# Run the script.
main();

=head1 NAME

B<dvd-duplicate-detector> - Deduplicates DVD rips

=head1 SYNOPSIS

B<dvd-duplicate-detector> [B<--deduplicate-directory> I<directory>]

B<dvd-duplicate-detector> B<--help>

=head1 DESCRIPTION

B<dvd-duplicate-detector> deduplicates DVD rips with duplicated files
and prepopulates directories to speed up ripping of DVDs with duplicated
files (only works if the rip software supports resuming).

To detect duplicates, the isoinfo(8) output is read from STDIN.

The duplicate detection is a bit simplified and will only detect duplicated
files that start at the same sector. It will not detect duplicated files
with overlapping ranges but different start sectors, which are theoretically
possible but apparently not yet used in the real world.

=head1 OPTIONS

B<--deduplicate-directory> I<directory> Deduplicate an already existing
DVD rip in the directory by replacing duplicates with hardlinks or
prepopulate the directory with hardlinks for duplicated files, so
the content only has to be ripped once.

B<--help> Show usage.

If no option is specified, the isoinfo output is read from STDIN,
duplicates are analyzed and the results are shown, but nothing
is written to the disk.

=head1 EXAMPLES

To look for duplicates:

isoinfo -l dev=1,0,0 | dvd-duplicate-detector

To deduplicate or prepopulate a directory:

isoinfo -l dev=1,0,0 | dvd-duplicate-detector --deduplicate-directory BLAFASEL

The examples assume that 1,0,0 is the address of the DVD drive.
Use cdrecord --scanbus to find the correct address on your system.

=head1 SEE ALSO

cdrecord(8) isoinfo(8)

=head1 AUTHOR

Fabian Keil <fk@fabiankeil.de>

https://www.fabiankeil.de/gehacktes/dvd-duplicate-detector/

=cut
