#!/usr/bin/perl
#
# Walk a directory of hand-written HTML archive pages and print each
# page as one record in Movable Type's plain-text import format.
# Output goes to STDOUT; diagnostics go to STDERR.
#
# NOTE(review): the copy of this script that was reviewed had every
# <...> span stripped out of it. The pieces marked NOTE(review) below
# were reconstructed and should be checked against a pristine copy.

use warnings;
use strict;
use File::Find;

# your full path to /dir/being/scraped/.
my $dir = "/usr/www/users/username/includes/archives/2002/";

# lookups for month conversion. no date modules are needed.
my %months = (
    January   => '01', February => '02', March    => '03',
    April     => '04', May      => '05', June     => '06',
    July      => '07', August   => '08', September => '09',
    October   => '10', November => '11', December => '12',
);

# initiate find/conversion.
find(\&importable_files, $dir);

# File::Find callback: convert one archive page into an import record.
#
# Reads the current entry's base name from $_ and its full path from
# $File::Find::name (both set by File::Find). Entries whose name does
# not end in <digits>.html, or that are not plain files, are ignored.
# Dies if a selected file cannot be opened or closed.
sub importable_files {

    # Filter first so that only files we actually read pay for the
    # pause below. The dot is escaped so that e.g. "1xhtml" is rejected.
    return unless $_ =~ /\d+\.html\z/;    # files not ##.html
    return unless -f $_;

    # one-second sleep command reduces server load.
    sleep(1);

    # open the file and suck it into an array.
    open(my $fh, '<', $File::Find::name)
        or die "can't open $File::Find::name: $!";
    my @lines = <$fh>;
    close($fh) or die "can't close $File::Find::name: $!";

    # holders for our export. Start as empty strings so a page with
    # no date or no body does not trigger "uninitialized" warnings.
    my ($date, $body) = ('', '');

    # loop through each line, looking for good stuff.
    foreach my $line (@lines) {

        # skip the old anchors that Cam used.
        # NOTE(review): only the closing tag survived in the reviewed
        # copy; an opening anchor tag may be missing from this pattern.
        next if $line =~ /^<\/A>$/;

        # we've found a date.
        # NOTE(review): the opening tag was stripped from the reviewed
        # copy; <h2> is inferred from the surviving </h2>.
        if ($line =~ /<h2>(.*)<\/h2>/i) {
            my $heading = $1;
            my $stamp   = _entry_date($heading);
            if (defined $stamp) {
                $date = $stamp;
            }
            else {
                warn "$File::Find::name: unparseable date '$heading'\n";
            }
            next;    # we don't want this as part of $body.
        }

        # any other line, including whitespace, is assumed to
        # be part of the body, so we suck that all in here.
        $body .= $line;
    }

    warn "$File::Find::name: no date heading found\n" if $date eq '';

    # Kept from the original: this only fires when the body consists
    # of nothing but two (or three) newlines.
    $body =~ s/^\n{2}$//;

    my $title = _entry_title($body);

    $body =~ s/\n$//;    # ending newline.

    # and print out content into MT import format.
    # NOTE(review): the original template was lost. This follows the
    # Movable Type import layout (metadata, "-----", BODY section,
    # "-----", then "--------" between entries). The original may have
    # carried more metadata lines (e.g. AUTHOR) -- confirm.
    print <<"MT_IMPORT";
TITLE: $title
DATE: $date
-----
BODY:
$body
-----
--------
MT_IMPORT

    return;
}

# Turn a heading such as "Monday, January 7, 2002" into
# "01/07/2002 hh:mm:ss PM" with a randomized time of day.
# Returns undef when the heading does not match or the month name
# is not in %months.
sub _entry_date {
    my ($heading) = @_;

    # parse ourselves so we don't require Perl modules.
    my ($month_name, $day, $year)
        = $heading =~ /\w+, (\w+) (\d+), (\d{4})/
        or return;

    my $mo = $months{$month_name};
    return unless defined $mo;

    # randomize a timestamp for this entry. Plain %02d is used because
    # a ".0" precision turns off zero padding and prints nothing for 0.
    # The hour is 1..12 because the stamp is on a 12-hour clock.
    return sprintf '%s/%02d/%s %02d:%02d:%02d PM',
        $mo, $day, $year,
        1 + int(rand(12)), int(rand(60)), int(rand(60));
}

# Build a title from the first five words of the body, with HTML
# removed. " ..." is appended only when words were actually dropped.
sub _entry_title {
    my ($body) = @_;

    # split ' ' skips leading whitespace and splits on whitespace
    # runs, so blank lines never count as words.
    my @words = split ' ', strip_html($body);

    return join(' ', @words) if @words <= 5;
    return join(' ', @words[0 .. 4]) . ' ...';
}

# Return a copy of the given string with every <...> tag removed.
sub strip_html {
    my $t = shift;
    $t =~ s/<[^>]*>//g;
    return $t;
}

1;