| [f8c4e94] | 1 | #!/usr/bin/perl -- # -*- Perl -*-
 | 
|---|
 | 2 | 
 | 
|---|
 | 3 | # this needs some cleanup...
 | 
|---|
 | 4 | 
 | 
|---|
 | 5 | my $PSTOTEXT = "pstotext";
 | 
|---|
 | 6 | 
 | 
|---|
 | 7 | my $pdf = shift @ARGV;
 | 
|---|
 | 8 | 
 | 
|---|
 | 9 | my $index = "";
 | 
|---|
 | 10 | my $inindex = 0;
 | 
|---|
 | 11 | open (F, "$PSTOTEXT $pdf |");
 | 
|---|
 | 12 | while (<F>) {
 | 
|---|
 | 13 |     if (/^<\/index/) {
 | 
|---|
 | 14 |         $index .= $_;
 | 
|---|
 | 15 |         $inindex = 0;
 | 
|---|
 | 16 |     }
 | 
|---|
 | 17 |     $inindex = 1 if /^<index/;
 | 
|---|
 | 18 | 
 | 
|---|
 | 19 |     if ($inindex) {
 | 
|---|
 | 20 |         $index .= $_ if /^\s*</;
 | 
|---|
 | 21 |     }
 | 
|---|
 | 22 | }
 | 
|---|
 | 23 | 
 | 
|---|
 | 24 | my $cindex = "";
 | 
|---|
 | 25 | while ($index =~ /^(.*?)((<phrase role=\"pageno\">.*?<\/phrase>\s*)+)/s) {
 | 
|---|
 | 26 |     $cindex .= $1;
 | 
|---|
 | 27 |     $_ = $2;
 | 
|---|
 | 28 |     $index = $'; # '
 | 
|---|
 | 29 | 
 | 
|---|
 | 30 |     my @pages = m/<phrase role=\"pageno\">.*?<\/phrase>\s*/sg;
 | 
|---|
 | 31 | 
 | 
|---|
 | 32 |     # Expand ranges
 | 
|---|
 | 33 |     if ($#pages >= 0) {
 | 
|---|
 | 34 |         my @mpages = ();
 | 
|---|
 | 35 |         foreach my $page (@pages) {
 | 
|---|
 | 36 |             my $pageno = &pageno($page);
 | 
|---|
 | 37 |             if ($pageno =~ /^([0-9]+)[^0-9]([0-9]+)$/) { # funky -
 | 
|---|
 | 38 |                 for (my $count = $1; $count <= $2; $count++) {
 | 
|---|
 | 39 |                     push (@mpages, "<phrase role=\"$pageno\">$count</phrase>");
 | 
|---|
 | 40 |                 }
 | 
|---|
 | 41 |             } else {
 | 
|---|
 | 42 |                 push (@mpages, $page);
 | 
|---|
 | 43 |             }
 | 
|---|
 | 44 |         }
 | 
|---|
 | 45 |         @pages = sort rangesort @mpages;
 | 
|---|
 | 46 |     }
 | 
|---|
 | 47 | 
 | 
|---|
 | 48 |     # Remove duplicates...
 | 
|---|
 | 49 |     if ($#pages > 0) {
 | 
|---|
 | 50 |         my @mpages = ();
 | 
|---|
 | 51 |         my $current = "";
 | 
|---|
 | 52 |         foreach my $page (@pages) {
 | 
|---|
 | 53 |             my $pageno = &pageno($page);
 | 
|---|
 | 54 |             if ($pageno ne $current) {
 | 
|---|
 | 55 |                 push (@mpages, $page);
 | 
|---|
 | 56 |                 $current = $pageno;
 | 
|---|
 | 57 |             }
 | 
|---|
 | 58 |         }
 | 
|---|
 | 59 |         @pages = @mpages;
 | 
|---|
 | 60 |     }
 | 
|---|
 | 61 | 
 | 
|---|
 | 62 |     # Collapse ranges...
 | 
|---|
 | 63 |     if ($#pages > 1) {
 | 
|---|
 | 64 |         my @cpages = ();
 | 
|---|
 | 65 |         while (@pages) {
 | 
|---|
 | 66 |             my $count = 0;
 | 
|---|
 | 67 |             my $len = &rangelen($count, @pages);
 | 
|---|
 | 68 |             if ($len <= 2) {
 | 
|---|
 | 69 |                 my $page = shift @pages;
 | 
|---|
 | 70 |                 push (@cpages, $page);
 | 
|---|
 | 71 |             } else {
 | 
|---|
 | 72 |                 my $fpage = shift @pages;
 | 
|---|
 | 73 |                 my $lpage = "";
 | 
|---|
 | 74 |                 while ($len > 1) {
 | 
|---|
 | 75 |                     $lpage = shift @pages;
 | 
|---|
 | 76 |                     $len--;
 | 
|---|
 | 77 |                 }
 | 
|---|
 | 78 |                 my $fpno = &pageno($fpage);
 | 
|---|
 | 79 |                 my $lpno = &pageno($lpage);
 | 
|---|
 | 80 |                 $fpage =~ s/>$fpno</>${fpno}-$lpno</s;
 | 
|---|
 | 81 |                 push (@cpages, $fpage);
 | 
|---|
 | 82 |             }
 | 
|---|
 | 83 |         }
 | 
|---|
 | 84 |         @pages = @cpages;
 | 
|---|
 | 85 |     }
 | 
|---|
 | 86 | 
 | 
|---|
 | 87 |     my $page = shift @pages;
 | 
|---|
 | 88 |     $page =~ s/\s*$//s;
 | 
|---|
 | 89 |     $cindex .= $page;
 | 
|---|
 | 90 |     while (@pages) {
 | 
|---|
 | 91 |         $page = shift @pages;
 | 
|---|
 | 92 |         $page =~ s/\s*$//s;
 | 
|---|
 | 93 |         $cindex .= ", $page";
 | 
|---|
 | 94 |     }
 | 
|---|
 | 95 | }
 | 
|---|
 | 96 | $cindex .= $index;
 | 
|---|
 | 97 | 
 | 
|---|
 | 98 | print "$cindex\n";
 | 
|---|
 | 99 | 
 | 
|---|
 | 100 | sub pageno {
 | 
|---|
 | 101 |     my $page = shift;
 | 
|---|
 | 102 | 
 | 
|---|
 | 103 |     $page =~ s/^<phrase.*?>//;
 | 
|---|
 | 104 |     $page =~ s/^<link.*?>//;
 | 
|---|
 | 105 | 
 | 
|---|
 | 106 |     return $1 if $page =~ /^([^<>]+)/;
 | 
|---|
 | 107 |     return "?";
 | 
|---|
 | 108 | }
 | 
|---|
 | 109 | 
 | 
|---|
 | 110 | sub rangesort {
 | 
|---|
 | 111 |     my $apno = &pageno($a);
 | 
|---|
 | 112 |     my $bpno = &pageno($b);
 | 
|---|
 | 113 | 
 | 
|---|
 | 114 |     # Make sure roman pages come before arabic ones, otherwise sort them in order
 | 
|---|
 | 115 |     return -1 if ($apno !~ /^\d+/ && $bpno =~ /^\d+/);
 | 
|---|
 | 116 |     return  1 if ($apno =~ /^\d+/ && $bpno !~ /^\d+/);
 | 
|---|
 | 117 |     return $apno <=> $bpno;
 | 
|---|
 | 118 | }
 | 
|---|
 | 119 | 
 | 
|---|
 | 120 | sub rangelen {
 | 
|---|
 | 121 |     my $count = shift;
 | 
|---|
 | 122 |     my @pages = @_;
 | 
|---|
 | 123 |     my $len = 1;
 | 
|---|
 | 124 |     my $inrange = 1;
 | 
|---|
 | 125 | 
 | 
|---|
 | 126 |     my $current = &pageno($pages[$count]);
 | 
|---|
 | 127 |     while ($count < $#pages && $inrange) {
 | 
|---|
 | 128 |         $count++;
 | 
|---|
 | 129 |         my $next = &pageno($pages[$count]);
 | 
|---|
 | 130 |         if ($current + 1 eq $next) {
 | 
|---|
 | 131 |             $current = $next;
 | 
|---|
 | 132 |             $inrange = 1;
 | 
|---|
 | 133 |             $len++;
 | 
|---|
 | 134 |         } else {
 | 
|---|
 | 135 |             $inrange = 0;
 | 
|---|
 | 136 |         }
 | 
|---|
 | 137 |     }
 | 
|---|
 | 138 | 
 | 
|---|
 | 139 |     return $len;
 | 
|---|
 | 140 | }
 | 
|---|