#!/usr/bin/perl -w

=pod

=head1 NAME

tv_grab_de - Grab TV listings for Germany and Austria.

=head1 SYNOPSIS

tv_grab_de [--help] [--output FILE] [--days N] [--offset N] [--quiet]

=head1 DESCRIPTION

Output TV listings for several channels available in German-speaking
countries.  The data comes from Gottfried SzingE<39>s database which
is exported to XMLTV format.  The default is to grab as many days as
possible from the current day onwards.

B<--output FILE> Write to FILE rather than standard output.

B<--days N> Grab N days starting from today, rather than as many as
possible.

B<--offset N> Start grabbing N days from today, rather than starting
today.  N may be negative.

B<--quiet> Suppress the progress messages normally written to standard
error.

B<--version> Show the version of the grabber.

B<--help> Print a help message and exit.

=head1 SEE ALSO

L<xmltv(5)>, L<http://www.szing.at/>, L<http://www.szing.at/xmltv/>

=head1 AUTHOR

Ed Avis, ed@membled.com

=head1 BUGS

The data produced has the same timezones as the upstream source.  At
present, that source is not handling daylight saving time correctly,
so the timezones in XML output are wrong.  (However, the times
themselves are correct, once you allow for the different timezone.)

The data source does not include full channels information and the
channels are identified by short names rather than the RFC2838 form
recommended by the XMLTV DTD.

During summer time, for a period of one hour either side of midnight
the calculation of 'today' may be wrong (could grab one hour too
little or 23 hours too much).  Should check whether this does in fact
happen.

=cut

use strict;

warn <<END
This grabber is currently unable to fetch listings, because the file
generation at the server has stopped.  When the server starts genating
files again, the grabber will work again.  Proceeding, but unless the
server has been fixed recently it will not work...

END
  ;

# Some users report that without 'use bytes' the script gives errors
# when run in a UTF-8 locale.  Which is crazy, because the XML files
# are ISO-8859-1 only.  The problem seems to have been introduced with
# version 1.25 of this program, that added a substitution of channel
# names.  I have no idea why this is, but not being able to reproduce
# the problem on my own machine I just work around it with:
#
use bytes;

use XMLTV::Version '$Id: tv_grab_de,v 1.35 2006/04/12 08:19:16 fgouget Exp $ ';

# We work by inheriting from XMLTV::Grab_XML and overriding certain
# methods.
#
use XMLTV::Grab_XML;
package Grab_XML_de;
use base 'XMLTV::Grab_XML';

# No need to be nice.
$XMLTV::Get_nice::Delay = 0;

use HTML::Entities qw(%entity2char);
use Date::Manip;
use XMLTV::DST;
use XMLTV::Gunzip;

# Todo: perhaps we should internationalize messages and docs?
sub country( $ ) {
    my $pkg = shift;
    return 'Germany/Austria';
}

# Assume the listings source uses CET (see BUGS above).
sub date_init( $ ) {
    my $pkg = shift;
    Date_Init('TZ=CET');
}

my $URL_BASE = 'http://yasd.cc/xmltv/';

# Returns a hash mapping YYYMMDD to URL.
sub urls_by_date( $ ) {
    my $pkg = shift;
    my $url = "${URL_BASE}index.lst";
    my $index = $pkg->get($url);
    die "could not get index page $url, aborting\n"
      if not defined $index;
    my %urls;
    foreach (split /\n/, $index) {
	if (not /^(tv_(\d{8})\.xml\.gz);(\d+)$/) {
	    warn "unexpected entry in index: $_, skipping\n";
	    next;
	}
	my ($filename, $date, $size) = ($1, $2, $3);
	if (defined $urls{$date}) {
	    warn "file for $date seen twice in index\n";
	}
	$urls{$date} = "$URL_BASE$filename";
    }
    return %urls;
}

# Given some data downloaded for a particular day, turn it into XML.
sub xml_from_data( $$ ) {
    my $pkg = shift;
    local $_ = gunzip(shift);
    if (not defined) {
	warn 'could not gunzip';
	return undef;
    }

    # Fix empty 'length' elements that wrongly appear in the XML.
    s!<length[^>]*></length>!!g;

    # The XML has a global replacement of & with ' und ', which works
    # to guarantee that bad entities never appear, but it's not really
    # the right answer.  Try to undo some of the damage (assume
    # ISO-8859-1).
    #
    s/ und amp;/&amp;/g;
    s/ und lt;/&lt;/g;
    s/ und gt;/&gt;/g;
    s/ und (\w+);/$entity2char{$1} or " und $1;"/ge;

    # Change the channel identifiers to make them more RFC2838-ish,
    # and add <channel> elements.
    #
    # This string manipulation is rather kludgy, but safe enough if we
    # know the input XML will follow certain patterns (no ]]>
    # nonsense) and it avoids a separate postprocessing stage.
    #
    my %ch_to_rfc2838;
    my $to_rfc2838 = sub( $ ) {
	my $c = shift;
	for ($ch_to_rfc2838{$c}) {
	    return $_ if defined;
	    $_ = lc $c . '.szing.at';
	    tr/ /-/;
	    return $_;
	}
    };

    s/channel="(.+?)">/'channel="' . $to_rfc2838->($1) . '">'/eg;
    my $make_ch_elem = sub {
	return <<END
  <channel id="$ch_to_rfc2838{$_}">
    <display-name>$_</display-name>
  </channel>
END
  ;
    };
    my $channels = join '', map { $make_ch_elem->() } sort keys %ch_to_rfc2838;
    s/  <programme/$channels  <programme/;

    # Remove occasional empty <desc>.
    s!<desc[^>]*>\s*</desc>!!g;

    # The timezones are incorrect during summer - they still give
    # +0100 instead of +0200.  Throw away the timezone information and
    # try to guess it in the same way as other grabbers.
    #
    s/(start|stop)="(\d+) \+\d+"/"$1=\"" . utc_offset($2, '+0100') . '"'/eg;

    return $_;
}

Grab_XML_de->go();
