#!/usr/bin/perl

#By Gabor Szabo
#Patch by Orna Agmon
#Partial rewrite by Eli Billauer (this soup has many cooks)
use strict;
use warnings;

my $VERSION = '0.01';

use HTML::Parser;
use POSIX qw(mktime strftime);

my $dir     = $ARGV[0];
my $publicdir = $ARGV[1];
my $outfile = $ARGV[2];
my $msgfile = $ARGV[3] or die "Usage: $0 future-directory-name public-dir rss2-file announcement-file\n";
my $now = time();
my $pubdate = POSIX::strftime("%a, %d %b %Y %H:%M:%S %z", localtime($now));

my $td;     # counting the td elements
my %data;   # hold all the data about one date
my @tree;   # stack of current items

my %nextdata;
my $firstentry = 1;
my $nextlectures = "";

my %entities = ( # For escaping
		'&' => '&amp;',
		'<' => '&lt;',
		'>' => '&gt;',
		"'" => '&apos;',
		'"' => '&quot;'
	       );

open (OUT, ">$outfile") or die("Failed to open \"$outfile\" for write\n");
open (MESSAGE, ">$msgfile") or die("Failed to open \"$msgfile\" for write\n");

print OUT <<"ENDOFHEADER";
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0">
<channel>
<title>Haifux</title>
<link>http://www.haifux.org/</link>
<description>Haifux Linux Club</description>
<language>en-us</language>
<pubDate>$pubdate</pubDate>
<generator>Haifux homecooked RSS generator</generator>
<ttl>360</ttl>
ENDOFHEADER

FILE: foreach my $input_file (<$dir/*>) {
	my %next;
        next FILE if ($input_file =~ /index\.htm/i);

	$td = 0;
	%data = (comment => '');
	
	my $p = HTML::Parser->new( api_version => 3,
			start_h => [\&start, "tagname, attr"],
			end_h   => [\&end,   "tagname"],
			text_h  => [\&text,  "dtext"],
			marked_sections => 1,
	);
	
	$p->parse_file($input_file);

	unless (defined $data{lecture_link}) {
	  warn("Warning: Lecture number $data{id} skipped because it doesn't have a lecture link.\n") if (defined $data{id});
	  next FILE;
	}	
	
	unless (open (L, "$publicdir/$data{lecture_link}/index.html")) {
	  warn("Warning: No abstract for lecture number $data{id} because \"$publicdir/$data{lecture_link}/index.html\" could not be opened for read\n");
	} else {

	  local $/; # Enable slurp mode.
	  my $lecdata = <L>;
	  close L;

	  # This regular expression takes the smallest <p> to </p> snipped, which
	  # comes after a </h2>. Let's hope it's the abstract.

	  ($data{abstract}) = ($lecdata =~ /<\/h2>.*?<p>[ \t\n\r]*(.*?)[ \t\n\r]*<\/p>/is);

	  # The first abstract we encounter is the one we encounter first
	  $next{abstract} = $data{abstract} || "To be published.";

	  unless (defined $data{abstract}) {
	    warn("Warning: No abstract was resolved from \"$publicdir/$data{lecture_link}. (Not published on billboards)\".\n");
	    warn("(I expected to find a </h2> and then a <p>...Text...</p>)\n");
	  }	
	}
	$data{abstract} = "" unless (defined $data{abstract});
	$data{abstract} = xmlescape($data{abstract});

	foreach my $x (qw [id date lecture_link lecture_text person_text ]) {
	  # If there isn't something meaningful in all entries, it's not
	  # mature for publishing yet (in particular, as this goes to the
	  # Technion bill board database)

	  unless ((defined $data{$x}) && ($data{$x} =~ /[^ \t\n\r]/)) {
	    warn("Warning: Lecture number $data{id} skipped because it doesn't have a $x entry\n");
	    next FILE;
	  }
	  $next{$x} = $data{$x};
	  $data{$x} = xmlescape($data{$x});
	}

	my ($day, $month, $year) = ($data{date} =~ /(\d+)\D+(\d+)\D+(\d+)/);
	unless (defined $year) {
	  warn("Warning: Lecture number $data{id} skipped because its date couldn't be resolved\n");
	  next FILE;
	}

	$year = 2000 + $year if ($year < 100);	

	my $sqldate = sprintf("%4d-%02d-%02d 18:30:00",$year,$month,$day);
	$next{nicedate} = POSIX::strftime("%A, %B ", 0, 0, 0, $day, $month-1, $year-1900);
	$next{nicedate} .= $day;
	if (($day== 1) || ($day== 21) || ($day== 31)) {
	  $next{nicedate} .= 'st';
	} elsif (($day == 2) || ($day == 22)) {
	  $next{nicedate} .= 'nd';
	} elsif (($day == 3) || ($day == 23)) {
	  $next{nicedate} .= 'rd';
	} else {
	  $next{nicedate} .= 'th';
	}
  
	if ($firstentry) {
	  %nextdata = %next;
	  $firstentry = 0;
	} else {
	  my $line = sprintf('%02d/%02d/%02d',$day, $month, $year % 100)." $next{lecture_text}: $next{person_text}";
	  $line =~ s/<.*?>//gs; # Remove HTML tags
	  $line =~ s/[ \t\n\r]+/ /gs; # Single whitespaces

	  $line = join "\n         ", ($line =~ /(.{1,68})(?: |$)/g);
          $nextlectures .= "$line\n";
	}
        print OUT <<"ITEMEND";
<item>
<title>$data{date}: $data{lecture_text} ($data{person_text})</title>
<link>http://www.haifux.org/$data{lecture_link}</link>
<description>$data{abstract}</description>
<pubDate>$pubdate</pubDate>
<guid isPermaLink="true">http://www.haifux.org/$data{lecture_link}</guid>
<haifuxdate>$sqldate</haifuxdate>
<haifuxtitle>$data{lecture_text}</haifuxtitle>
<haifuxabstract>$data{abstract}</haifuxabstract>
<haifuxspeaker>$data{person_text}</haifuxspeaker>
<haifuxlink>http://www.haifux.org/$data{lecture_link}</haifuxlink>
<haifuxplace>Taub 6</haifuxplace>
</item>
ITEMEND

}

print OUT "</channel>\n</rss>\n";
close(OUT);

$nextdata{abstract} =~ s/<.*?>//gs; # Remove HTML tags
$nextdata{abstract} =~ s/[ \t\n\r]+/ /gs; # Single whitespaces

$nextlectures = "There are no scheduled lectures.\n"
  unless ($nextlectures);

print MESSAGE<<"MSGEND";
On $nextdata{nicedate} at 18:30, Haifux will gather to hear a talk by $nextdata{person_text}:

   $nextdata{lecture_text}

Abstract

$nextdata{abstract}
   
=================================================================

We meet in Taub building, room 6. For instructions see:
http://www.haifux.org/where.html

Attendance is free, and you are all invited!

==================================================================
Future lectures:

$nextlectures

==================================================================

We are always interested in hearing your talks and ideas. If you wish to give a talk, hold a discussion, or just plan some event haifux might be interested in, please contact us at webmaster\@haifux.org 
MSGEND
close MESSAGE;
exit; # so we know here is the end of it :)


sub start {
	my ($tagname, $attr) = @_;
	$td++ if $tagname eq "td";#count the table cell we are in
	push @tree, $tagname;

	if ($td == 2 and $tagname eq "a") { 
	  $data{lecture_link} = $attr->{href};
	}
	if ($td == 3 and $tagname eq "a") { 
	  $data{person_link}  = $attr->{href};
	}
	if ($td == 5 and $tagname eq "a") {
	  $data{comment}     .= qq(&lt;a href=&quot;$attr->{href}&quot;&gt;);
#	  print "1comment $data{'comment'}";
	}
}

sub end {
	my ($tagname) = @_;
	pop @tree || die "Not symmetric\n";
	if ($td == 5 and $tagname eq "a") {
	  $data{comment}     .= qq(&lt;/a&gt;);
#	  print "2comment $data{'comment'}";
	}
}

sub text {
	my ($text) = @_;
	return if not @tree;
	#work around - html parser interprets nbsp as char 160,
	# it should be ' '
	my $nbsp=chr(160);
	$text=~s/$nbsp/ /g;

# Eli, 16.2.13. Don't be picky about what's in <td>'s
	if ($td == 1 and $tree[-1] eq "td") { $data{id}           = $text; }
#	if ($td == 2 and $tree[-1] eq "a")  { $data{lecture_text} = $text; }
#	if ($td == 3 and $tree[-1] eq "a")  { $data{person_text}  = $text; }
	if ($td == 2 and (($tree[-1] eq "a") or ($tree[-1] eq "td")))
          { $data{lecture_text} = $text; }
	if ($td == 3 and (($tree[-1] eq "a") or ($tree[-1] eq "td"))) 
          { $data{person_text}  = $text; }
	if ($td == 4 and $tree[-1] eq "td" or $tree[-1] eq "div") {
	  $data{date}         = $text;
	}
	if ($td == 5 and $tree[-1])         { $data{comment}     .= $text; }
}

sub xmlescape {
  my ($x) = @_;  

  $x =~ s/([&<>\'\"])/$entities{$1}/ge;
  # As an extra service, change all whitespace sequences to a single space

  $x =~ s/[ \t\n\r]+/ /g;
  ($x) = ($x =~ /^[ ]*(.*?)[ ]*$/);
  
  return $x;
}

