#!/usr/bin/perl
#By Gabor Szabo
#Patch by Orna Agmon
#Partial rewrite by Eli Billauer (this soup has many cooks)
use strict;
use warnings;
my $VERSION = '0.01';
use HTML::Parser;
use POSIX qw(mktime strftime);
my $dir = $ARGV[0];
my $publicdir = $ARGV[1];
my $outfile = $ARGV[2];
my $msgfile = $ARGV[3] or die "Usage: $0 future-directory-name public-dir rss2-file announcement-file\n";
my $now = time();
my $pubdate = POSIX::strftime("%a, %d %b %Y %H:%M:%S %z", localtime($now));
my $td; # counting the td elements
my %data; # hold all the data about one date
my @tree; # stack of current items
my %nextdata;
my $firstentry = 1;
my $nextlectures = "";
my %entities = ( # For escaping
'&' => '&',
'<' => '<',
'>' => '>',
"'" => ''',
'"' => '"'
);
open (OUT, ">$outfile") or die("Failed to open \"$outfile\" for write\n");
open (MESSAGE, ">$msgfile") or die("Failed to open \"$msgfile\" for write\n");
print OUT <<"ENDOFHEADER";
Haifux
http://www.haifux.org/
Haifux Linux Cluben-us$pubdateHaifux homecooked RSS generator360
ENDOFHEADER
FILE: foreach my $input_file (<$dir/*>) {
my %next;
next FILE if ($input_file =~ /index\.htm/i);
$td = 0;
%data = (comment => '');
my $p = HTML::Parser->new( api_version => 3,
start_h => [\&start, "tagname, attr"],
end_h => [\&end, "tagname"],
text_h => [\&text, "dtext"],
marked_sections => 1,
);
$p->parse_file($input_file);
unless (defined $data{lecture_link}) {
warn("Warning: Lecture number $data{id} skipped because it doesn't have a lecture link.\n") if (defined $data{id});
next FILE;
}
unless (open (L, "$publicdir/$data{lecture_link}/index.html")) {
warn("Warning: No abstract for lecture number $data{id} because \"$publicdir/$data{lecture_link}/index.html\" could not be opened for read\n");
} else {
local $/; # Enable slurp mode.
my $lecdata = ;
close L;
# This regular expression takes the smallest
to
snipped, which
# comes after a . Let's hope it's the abstract.
($data{abstract}) = ($lecdata =~ /<\/h2>.*?
[ \t\n\r]*(.*?)[ \t\n\r]*<\/p>/is);
# The first abstract we encounter is the one we encounter first
$next{abstract} = $data{abstract} || "To be published.";
unless (defined $data{abstract}) {
warn("Warning: No abstract was resolved from \"$publicdir/$data{lecture_link}. (Not published on billboards)\".\n");
warn("(I expected to find a and then a
...Text...
)\n");
}
}
$data{abstract} = "" unless (defined $data{abstract});
$data{abstract} = xmlescape($data{abstract});
foreach my $x (qw [id date lecture_link lecture_text person_text ]) {
# If there isn't something meaningful in all entries, it's not
# mature for publishing yet (in particular, as this goes to the
# Technion bill board database)
unless ((defined $data{$x}) && ($data{$x} =~ /[^ \t\n\r]/)) {
warn("Warning: Lecture number $data{id} skipped because it doesn't have a $x entry\n");
next FILE;
}
$next{$x} = $data{$x};
$data{$x} = xmlescape($data{$x});
}
my ($day, $month, $year) = ($data{date} =~ /(\d+)\D+(\d+)\D+(\d+)/);
unless (defined $year) {
warn("Warning: Lecture number $data{id} skipped because its date couldn't be resolved\n");
next FILE;
}
$year = 2000 + $year if ($year < 100);
my $sqldate = sprintf("%4d-%02d-%02d 18:30:00",$year,$month,$day);
$next{nicedate} = POSIX::strftime("%A, %B ", 0, 0, 0, $day, $month-1, $year-1900);
$next{nicedate} .= $day;
if (($day== 1) || ($day== 21) || ($day== 31)) {
$next{nicedate} .= 'st';
} elsif (($day == 2) || ($day == 22)) {
$next{nicedate} .= 'nd';
} elsif (($day == 3) || ($day == 23)) {
$next{nicedate} .= 'rd';
} else {
$next{nicedate} .= 'th';
}
if ($firstentry) {
%nextdata = %next;
$firstentry = 0;
} else {
my $line = sprintf('%02d/%02d/%02d',$day, $month, $year % 100)." $next{lecture_text}: $next{person_text}";
$line =~ s/<.*?>//gs; # Remove HTML tags
$line =~ s/[ \t\n\r]+/ /gs; # Single whitespaces
$line = join "\n ", ($line =~ /(.{1,68})(?: |$)/g);
$nextlectures .= "$line\n";
}
print OUT <<"ITEMEND";
$data{date}: $data{lecture_text} ($data{person_text})
http://www.haifux.org/$data{lecture_link}
$data{abstract}$pubdatehttp://www.haifux.org/$data{lecture_link}$sqldate$data{lecture_text}$data{abstract}$data{person_text}http://www.haifux.org/$data{lecture_link}Taub 6
ITEMEND
}
print OUT "\n\n";
close(OUT);
$nextdata{abstract} =~ s/<.*?>//gs; # Remove HTML tags
$nextdata{abstract} =~ s/[ \t\n\r]+/ /gs; # Single whitespaces
$nextlectures = "There are no scheduled lectures.\n"
unless ($nextlectures);
print MESSAGE<<"MSGEND";
On $nextdata{nicedate} at 18:30, Haifux will gather to hear a talk by $nextdata{person_text}:
$nextdata{lecture_text}
Abstract
$nextdata{abstract}
=================================================================
We meet in Taub building, room 6. For instructions see:
http://www.haifux.org/where.html
Attendance is free, and you are all invited!
==================================================================
Future lectures:
$nextlectures
==================================================================
We are always interested in hearing your talks and ideas. If you wish to give a talk, hold a discussion, or just plan some event haifux might be interested in, please contact us at webmaster\@haifux.org
MSGEND
close MESSAGE;
exit; # so we know here is the end of it :)
sub start {
my ($tagname, $attr) = @_;
$td++ if $tagname eq "td";#count the table cell we are in
push @tree, $tagname;
if ($td == 2 and $tagname eq "a") {
$data{lecture_link} = $attr->{href};
}
if ($td == 3 and $tagname eq "a") {
$data{person_link} = $attr->{href};
}
if ($td == 5 and $tagname eq "a") {
$data{comment} .= qq(<a href="$attr->{href}">);
# print "1comment $data{'comment'}";
}
}
sub end {
my ($tagname) = @_;
pop @tree || die "Not symmetric\n";
if ($td == 5 and $tagname eq "a") {
$data{comment} .= qq(</a>);
# print "2comment $data{'comment'}";
}
}
sub text {
my ($text) = @_;
return if not @tree;
#work around - html parser interprets nbsp as char 160,
# it should be ' '
my $nbsp=chr(160);
$text=~s/$nbsp/ /g;
# Eli, 16.2.13. Don't be picky about what's in
's
if ($td == 1 and $tree[-1] eq "td") { $data{id} = $text; }
# if ($td == 2 and $tree[-1] eq "a") { $data{lecture_text} = $text; }
# if ($td == 3 and $tree[-1] eq "a") { $data{person_text} = $text; }
if ($td == 2 and (($tree[-1] eq "a") or ($tree[-1] eq "td")))
{ $data{lecture_text} = $text; }
if ($td == 3 and (($tree[-1] eq "a") or ($tree[-1] eq "td")))
{ $data{person_text} = $text; }
if ($td == 4 and $tree[-1] eq "td" or $tree[-1] eq "div") {
$data{date} = $text;
}
if ($td == 5 and $tree[-1]) { $data{comment} .= $text; }
}
sub xmlescape {
my ($x) = @_;
$x =~ s/([&<>\'\"])/$entities{$1}/ge;
# As an extra service, change all whitespace sequences to a single space
$x =~ s/[ \t\n\r]+/ /g;
($x) = ($x =~ /^[ ]*(.*?)[ ]*$/);
return $x;
}