Haifux

#!/usr/bin/perl #By Gabor Szabo #Patch by Orna Agmon #Partial rewrite by Eli Billauer (this soup has many cooks) use strict; use warnings; my $VERSION = '0.01'; use HTML::Parser; use POSIX qw(mktime strftime); my $dir = $ARGV[0]; my $publicdir = $ARGV[1]; my $outfile = $ARGV[2]; my $msgfile = $ARGV[3] or die "Usage: $0 future-directory-name public-dir rss2-file announcement-file\n"; my $now = time(); my $pubdate = POSIX::strftime("%a, %d %b %Y %H:%M:%S %z", localtime($now)); my $td; # counting the td elements my %data; # hold all the data about one date my @tree; # stack of current items my %nextdata; my $firstentry = 1; my $nextlectures = ""; my %entities = ( # For escaping '&' => '&', '<' => '<', '>' => '>', "'" => ''', '"' => '"' ); open (OUT, ">$outfile") or die("Failed to open \"$outfile\" for write\n"); open (MESSAGE, ">$msgfile") or die("Failed to open \"$msgfile\" for write\n"); print OUT <<"ENDOFHEADER"; Haifux http://www.haifux.org/ Haifux Linux Club en-us $pubdate Haifux homecooked RSS generator 360 ENDOFHEADER FILE: foreach my $input_file (<$dir/*>) { my %next; next FILE if ($input_file =~ /index\.htm/i); $td = 0; %data = (comment => ''); my $p = HTML::Parser->new( api_version => 3, start_h => [\&start, "tagname, attr"], end_h => [\&end, "tagname"], text_h => [\&text, "dtext"], marked_sections => 1, ); $p->parse_file($input_file); unless (defined $data{lecture_link}) { warn("Warning: Lecture number $data{id} skipped because it doesn't have a lecture link.\n") if (defined $data{id}); next FILE; } unless (open (L, "$publicdir/$data{lecture_link}/index.html")) { warn("Warning: No abstract for lecture number $data{id} because \"$publicdir/$data{lecture_link}/index.html\" could not be opened for read\n"); } else { local $/; # Enable slurp mode. my $lecdata = ; close L; # This regular expression takes the smallest

snipped, which # comes after a . Let's hope it's the abstract. ($data{abstract}) = ($lecdata =~ /<\/h2>.*?

[ \t\n\r]*(.*?)[ \t\n\r]*<\/p>/is); # The first abstract we encounter is the one we encounter first $next{abstract} = $data{abstract} || "To be published."; unless (defined $data{abstract}) { warn("Warning: No abstract was resolved from \"$publicdir/$data{lecture_link}. (Not published on billboards)\".\n"); warn("(I expected to find a and then a

...Text...

)\n"); } } $data{abstract} = "" unless (defined $data{abstract}); $data{abstract} = xmlescape($data{abstract}); foreach my $x (qw [id date lecture_link lecture_text person_text ]) { # If there isn't something meaningful in all entries, it's not # mature for publishing yet (in particular, as this goes to the # Technion bill board database) unless ((defined $data{$x}) && ($data{$x} =~ /[^ \t\n\r]/)) { warn("Warning: Lecture number $data{id} skipped because it doesn't have a $x entry\n"); next FILE; } $next{$x} = $data{$x}; $data{$x} = xmlescape($data{$x}); } my ($day, $month, $year) = ($data{date} =~ /(\d+)\D+(\d+)\D+(\d+)/); unless (defined $year) { warn("Warning: Lecture number $data{id} skipped because its date couldn't be resolved\n"); next FILE; } $year = 2000 + $year if ($year < 100); my $sqldate = sprintf("%4d-%02d-%02d 18:30:00",$year,$month,$day); $next{nicedate} = POSIX::strftime("%A, %B ", 0, 0, 0, $day, $month-1, $year-1900); $next{nicedate} .= $day; if (($day== 1) || ($day== 21) || ($day== 31)) { $next{nicedate} .= 'st'; } elsif (($day == 2) || ($day == 22)) { $next{nicedate} .= 'nd'; } elsif (($day == 3) || ($day == 23)) { $next{nicedate} .= 'rd'; } else { $next{nicedate} .= 'th'; } if ($firstentry) { %nextdata = %next; $firstentry = 0; } else { my $line = sprintf('%02d/%02d/%02d',$day, $month, $year % 100)." $next{lecture_text}: $next{person_text}"; $line =~ s/<.*?>//gs; # Remove HTML tags $line =~ s/[ \t\n\r]+/ /gs; # Single whitespaces $line = join "\n ", ($line =~ /(.{1,68})(?: |$)/g); $nextlectures .= "$line\n"; } print OUT <<"ITEMEND"; $data{date}: $data{lecture_text} ($data{person_text}) http://www.haifux.org/$data{lecture_link} $data{abstract} $pubdate http://www.haifux.org/$data{lecture_link} $sqldate $data{lecture_text} $data{abstract} $data{person_text} http://www.haifux.org/$data{lecture_link} Taub 6 ITEMEND } print OUT "\n\n"; close(OUT); $nextdata{abstract} =~ s/<.*?>//gs; # Remove HTML tags $nextdata{abstract} =~ s/[ \t\n\r]+/ /gs; # Single whitespaces $nextlectures = "There are no scheduled lectures.\n" unless ($nextlectures); print MESSAGE<<"MSGEND"; On $nextdata{nicedate} at 18:30, Haifux will gather to hear a talk by $nextdata{person_text}: $nextdata{lecture_text} Abstract $nextdata{abstract} ================================================================= We meet in Taub building, room 6. For instructions see: http://www.haifux.org/where.html Attendance is free, and you are all invited! ================================================================== Future lectures: $nextlectures ================================================================== We are always interested in hearing your talks and ideas. If you wish to give a talk, hold a discussion, or just plan some event haifux might be interested in, please contact us at webmaster\@haifux.org MSGEND close MESSAGE; exit; # so we know here is the end of it :) sub start { my ($tagname, $attr) = @_; $td++ if $tagname eq "td";#count the table cell we are in push @tree, $tagname; if ($td == 2 and $tagname eq "a") { $data{lecture_link} = $attr->{href}; } if ($td == 3 and $tagname eq "a") { $data{person_link} = $attr->{href}; } if ($td == 5 and $tagname eq "a") { $data{comment} .= qq(<a href="$attr->{href}">); # print "1comment $data{'comment'}"; } } sub end { my ($tagname) = @_; pop @tree || die "Not symmetric\n"; if ($td == 5 and $tagname eq "a") { $data{comment} .= qq(</a>); # print "2comment $data{'comment'}"; } } sub text { my ($text) = @_; return if not @tree; #work around - html parser interprets nbsp as char 160, # it should be ' ' my $nbsp=chr(160); $text=~s/$nbsp/ /g; # Eli, 16.2.13. Don't be picky about what's in 's if ($td == 1 and $tree[-1] eq "td") { $data{id} = $text; } # if ($td == 2 and $tree[-1] eq "a") { $data{lecture_text} = $text; } # if ($td == 3 and $tree[-1] eq "a") { $data{person_text} = $text; } if ($td == 2 and (($tree[-1] eq "a") or ($tree[-1] eq "td"))) { $data{lecture_text} = $text; } if ($td == 3 and (($tree[-1] eq "a") or ($tree[-1] eq "td"))) { $data{person_text} = $text; } if ($td == 4 and $tree[-1] eq "td" or $tree[-1] eq "div") { $data{date} = $text; } if ($td == 5 and $tree[-1]) { $data{comment} .= $text; } } sub xmlescape { my ($x) = @_; $x =~ s/([&<>\'\"])/$entities{$1}/ge; # As an extra service, change all whitespace sequences to a single space $x =~ s/[ \t\n\r]+/ /g; ($x) = ($x =~ /^[ ]*(.*?)[ ]*$/); return $x; }