#!/usr/bin/perl
# massage B.E. ciders XML files
# MUST RUN ON SOLARIS TO DEAL WITH UNICODE ISSUES
#
# Fetches the "repec"-format ListRecords feed for the CIDER working paper
# series from repositories.cdlib.org, splits it into records, and writes one
# "ReDIF-Paper 1.0" template per record to ciders.rdf in the current
# directory.
#
# NOTE(review): the copy of this script under review had lost every "<tag>"
# and "&entity;" token from its patterns (e.g. split("",...) and
# /(.+?)<\/ti/).  The opening tags below are restored from the closing tags
# that survived, and "<record>" from the OAI-PMH response layout.  Confirm
# against a live feed before relying on the output.
#
# NOTE(review): utf8_to_iso8859() expects get() to hand back raw UTF-8
# bytes.  Newer LWP::Simple releases may return already-decoded text, in
# which case the conversion step should be revisited -- TODO confirm.

use strict;
use warnings;

use LWP::Simple;
use Text::Wrap;

my $FEED_URL =
    "http://repositories.cdlib.org/cgi/oai.cgi?verb=ListRecords&metadataPrefix=repec&set=publication:6369646572";
my $OUT_FILE    = "ciders.rdf";
my $SERIES_NAME =
    "Center for International and Development Economics Research, Working Paper Series";
my $HANDLE      = "ciders";

# When true, utf8_to_iso8859() reports every converted character on STDERR.
my $debug = 1;

# Replacements applied to the whole feed, in this order, before it is split.
# "&amp;" is decoded before the numeric entities so that a double-encoded
# "&amp;#8220;" is also caught.  The raw UTF-8 byte sequences are included
# because utf8_to_iso8859() cannot represent these characters.
# NOTE(review): the entity spellings are a reconstruction -- see file header.
my @ENTITY_FIXES = (
    [ qr/&quot;/i                           => '"'  ],
    [ qr/&amp;/i                            => '&'  ],
    [ qr/&(?:#8220|ldquo);|\xE2\x80\x9C/i   => '"'  ],    # left double quote
    [ qr/&(?:#8221|rdquo);|\xE2\x80\x9D/i   => '"'  ],    # right double quote
    [ qr/&(?:#8211|ndash);|\xE2\x80\x93/i   => ':'  ],    # en dash
    [ qr/&(?:#8212|mdash);|\xE2\x80\x94/i   => '--' ],    # em dash
);

print "\ncdl-ciders v1.1.0\n\n";
print "C F Baum Apr 2002\n\n";

# Fetch first, and bail out before the output file is truncated if the
# fetch failed (LWP::Simple::get returns undef on failure).
my $str = get($FEED_URL);
die "Cannot fetch $FEED_URL\n" unless defined $str;

open(my $out_fh, '>', $OUT_FILE)
    or die "Cannot open $OUT_FILE for output: $!";

# Flatten line breaks so that "." in the extraction patterns spans them.
$str =~ tr/\n\r/  /;

for my $fix (@ENTITY_FIXES) {
    my ($pattern, $replacement) = @$fix;
    $str =~ s/$pattern/$replacement/g;
}

# Everything before the first <record> is response preamble, not a record.
my @records = split /<record>/, $str;
shift @records;

my $nstr = @records;
print "\n $nstr articles...\n\n";

my $nrtem = 0;
for my $record (@records) {
    $record = utf8_to_iso8859($record);

    my ($id) = $record =~ /oai:cdlib1:(.+?)<\/identifier/;
    $id = '' unless defined $id;

    my $credt = first_tag($record, 'datestamp');
    my $ti    = first_tag($record, 'ti');
    my $ab    = first_tag($record, 'ab');
    my $vol   = first_tag($record, 'vol');
    my $ppf   = first_tag($record, 'ppf');
    my $url   = first_tag($record, 'url');
    my $jel   = first_tag($record, 'categ');
    my @au    = all_tags($record, 'au');
    my @kw    = all_tags($record, 'kwd');

    print {$out_fh} "\nTemplate-Type: ReDIF-Paper 1.0\n";
    print {$out_fh} "Title: $ti\n";

    for my $author (@au) {
        my $first_name  = first_tag($author, 'fnm',  1);
        my $surname     = first_tag($author, 'snm',  1);
        my $affiliation = first_tag($author, 'affl', 1);
        print {$out_fh} "Author-Name: $first_name $surname \n";
        print {$out_fh} "Author-Workplace-Name: $affiliation \n";
    }

    if (length($ab) > 1) {
        print {$out_fh} "Abstract:\n";
        print {$out_fh} wrap(" ", " ", $ab);
        print {$out_fh} "\n";
    }

    if (@kw) {
        # join() avoids the dangling ", " the element-by-element print left.
        print {$out_fh} "Keywords: ", join(", ", @kw), "\n";
    }

    print {$out_fh} "Classification-JEL: $jel \n";
    print {$out_fh} "Series: $SERIES_NAME \n";
    print {$out_fh} "Number: $vol$ppf \n";
    print {$out_fh} "Creation-Date: $credt \n";
    # "Volume:" and "Pages:" lines are deliberately not emitted.
    print {$out_fh} "Note: oai:cdlib1:$id\n";
    print {$out_fh} "File-URL: $url \n";
    print {$out_fh} "File-Format: application/pdf \n";
    print {$out_fh} "Handle: RePEc:cdl:${HANDLE}:$vol$ppf \n";

    $nrtem++;
}

# Buffered write errors only surface at close, so check it.
close($out_fh) or die "Cannot close $OUT_FILE: $!";

print " \n$nrtem templates processed... \n\n";
exit;

# Return the text of the first <$tag>...</$tag> element found in $xml, or the
# empty string when there is none (or the element is empty).  Matching is
# case-sensitive unless $ignore_case is true.
sub first_tag {
    my ($xml, $tag, $ignore_case) = @_;
    my $re = $ignore_case
        ? qr/<$tag>(.+?)<\/$tag/i
        : qr/<$tag>(.+?)<\/$tag/;
    my ($text) = $xml =~ $re;
    return defined $text ? $text : '';
}

# Return the text of every <$tag>...</$tag> element in $xml, in document
# order, matching the tag name case-insensitively.
sub all_tags {
    my ($xml, $tag) = @_;
    my @texts = $xml =~ /<$tag>(.+?)<\/$tag/ig;
    return @texts;
}

# Convert a string of UTF-8 bytes to ISO-8859-1.
#
# Two-byte sequences are decoded to the single character they encode.
# Anything that cannot be represented in ISO-8859-1 (a decoded code point
# above 255, or a three-/four-byte sequence) becomes "?".  Bytes below 128
# and high bytes that do not form a sequence are passed through unchanged.
# Returns the empty string for undefined or empty input.
sub utf8_to_iso8859 {
    my ($str) = @_;
    return '' unless defined $str;

    $str =~ s{
        ( [\xC0-\xDF] ) ( [\x80-\xBF] )       # $1, $2: two-byte sequence
      | ( [\xE0-\xF7] [\x80-\xBF]{1,3} )      # $3: longer sequence
    }{
        defined $3 ? '?' : _latin1_char($1, $2)
    }gex;

    return $str;
}

# Decode one two-byte UTF-8 sequence given its lead and continuation bytes.
# Returns the decoded character, or "?" if it lies outside ISO-8859-1.
sub _latin1_char {
    my ($lead, $cont) = @_;

    # 5 payload bits from the lead byte, 6 from the continuation byte.
    my $code = ((ord($lead) & 31) << 6) + (ord($cont) & 63);
    return '?' if $code > 255;

    my $realch = chr($code);
    print STDERR "Hello: converted upper ascii char $realch(", ord($realch), ")\n"
        if ($debug);
    return $realch;
}

__END__