#!/opt/perl-5.004/bin/perl -w
#
# $Id: maketreedb.pl,v 1.2 1997/06/20 08:17:56 gnats Exp $
#
#
# DESCRIPTION: maketreedb.pl - index documents for the virtual searchtree
#
# URL: none yet
#
# AUTHOR: Cord Beermann (cord@Wunder-Nett.org)
#
# Thanks to: Ralf Begemann (begemann@cc.fh-lippe.de)
#	     Gerold Meerkoetter (gerold@noc.fh-lippe.de)
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
#
# A Perl script is "correct" if it gets the job done before your boss fires
# you.
# ('Programming Perl Second Edition' by Larry Wall, Tom Christiansen
#   & Randal. L. Schwartz)
#
# Bugs and shortcomings
# none yet
#
# Todo
# * write instructions
#
### Configuration ###
#
$WEBROOT="/var/infosystems/www/webroot";
$DBROOT="/fhl/dvz/loesungen";
$FILE_TITLE_DB="$WEBROOT$DBROOT/.file_title.db";
$KW_FILE_DB="$WEBROOT$DBROOT/.kw_file.db";
$FILE_KW_DB="$WEBROOT$DBROOT/.file_kw.db";
#
### End of Configuration ###

# Initialize Variables
use vars qw ! $WEBROOT $DBROOT $DESTFILE %kwdb %fdb ! ;

# include perl-modules
use POSIX;
use GDBM_File;

# open the db-files
# kwfiledb: kw --> file
tie %kw_file_db, 'GDBM_File', $KW_FILE_DB, O_CREAT|O_RDWR, 0644;
undef(%kw_file_db);

# filekwdb: file --> kw
tie %file_kw_db, GDBM_File, $FILE_KW_DB, O_CREAT|O_RDWR, 0644;
undef(%file_kw_db);

# fdb: file --> title
tie %file_title_db, GDBM_File, $FILE_TITLE_DB, O_CREAT|O_RDWR, 0644;
undef(%file_title_db);

# Loop for each file
foreach $file (<$WEBROOT$DBROOT/*/*.html>) {
    my(@keywords, $kw);
    open (FILE, "$file") || warn("open $file: $!");
    LOOP: while (defined($content = <FILE>)) {
	# store all after <head>-Tag
	if ($content =~ s#.*\<head\>##i) {
	    # if </head> is in the same line, exit LOOP
	    if ($content =~ s#\</head\>.*##i) {
		last LOOP;
	    } else {
		# read next line		
		while ($content .= <FILE>) {
		    # if </head> is in it, exit LOOP 
		    if ($content =~ s#\</head\>.*##i) {
			last LOOP;
		    }
		}
	    }
	}
    }
    # strip trailing, tailing and double spaces, LineFeeds .
    $content =~ s/\s+/ /g;
    $content =~ s/(^\s+|\s+$)//;
    $title = $content;
    # if there is no title, print a warning
    unless ($title =~ s#.*\<TITLE\>([^\>\<]+)\</TITLE\>.*#$1#i) {
	warn("$file: no title found\n");
	next;
    }
    # if there are no keywords, print a warning
    # else throw all away except for the keywords
    unless ($content =~ m#.*\<META NAME=\"keywords\" CONTENT=\"\s*([^\"]+)\s*\">.*#i) {
	warn("$file: no keywords found\n");
	next;
    }
    # load array with keywords    
    @keywords = split(/\s*,\s*/, $1);
    $file =~ s#$WEBROOT$DBROOT/##;
    # Filenames with embedded spaces won't work, print a warning
    if($file =~ m/ /) {
	warn("Space in filename \"$file\" - skipped");
	next;
    }
    
    # Loop for each keyword
    foreach $kw (@keywords) {
#	if(defined($kw_file_db{$kw}) and  $kw_file_db{$kw} !~ /\b$kw\b/) {
#	warn(length($kw_file_db{$kw})+length($file)+length($kw));
	if(defined($kw_file_db{$kw})) {
	    $kw_file_db{$kw} .= " $file";
	} else {
	    $kw_file_db{$kw} = $file;
	}
	if(defined($file_kw_db{$file}) and  $file_kw_db{$file} !~ /\b$file\b/) {
	    $file_kw_db{$file} .= " $kw";
	} else {
	    $file_kw_db{$file} = $kw;
	}
    }
    $file_title_db{$file} = $title;
}

# close files
untie(%kw_file_db);
untie(%file_title_db);
untie(%file_kw_db);

