source: clfs-embedded/BOOK/stylesheets/lfs-xsl/docbook-xsl-snapshot/fo/pdf2index@ cb0e00d

Last change on this file since cb0e00d was 9882b55, checked in by Manuel Canales Esparcia <manuel@…>, 17 years ago

Added the new lfs-xsl stylesheets.

  • Property mode set to 100755
File size: 2.7 KB
Line 
1#!/usr/bin/perl -- # -*- Perl -*-
2
3# this needs some cleanup...
4
5my $PSTOTEXT = "pstotext";
6
7my $pdf = shift @ARGV;
8
9my $index = "";
10my $inindex = 0;
11open (F, "$PSTOTEXT $pdf |");
12while (<F>) {
13 if (/^<\/index/) {
14 $index .= $_;
15 $inindex = 0;
16 }
17 $inindex = 1 if /^<index/;
18
19 if ($inindex) {
20 $index .= $_ if /^\s*</;
21 }
22}
23
24my $cindex = "";
25while ($index =~ /^(.*?)((<phrase role=\"pageno\">.*?<\/phrase>\s*)+)/s) {
26 $cindex .= $1;
27 $_ = $2;
28 $index = $'; # '
29
30 my @pages = m/<phrase role=\"pageno\">.*?<\/phrase>\s*/sg;
31
32 # Expand ranges
33 if ($#pages >= 0) {
34 my @mpages = ();
35 foreach my $page (@pages) {
36 my $pageno = &pageno($page);
37 if ($pageno =~ /^([0-9]+)[^0-9]([0-9]+)$/) { # funky -
38 for (my $count = $1; $count <= $2; $count++) {
39 push (@mpages, "<phrase role=\"$pageno\">$count</phrase>");
40 }
41 } else {
42 push (@mpages, $page);
43 }
44 }
45 @pages = sort rangesort @mpages;
46 }
47
48 # Remove duplicates...
49 if ($#pages > 0) {
50 my @mpages = ();
51 my $current = "";
52 foreach my $page (@pages) {
53 my $pageno = &pageno($page);
54 if ($pageno ne $current) {
55 push (@mpages, $page);
56 $current = $pageno;
57 }
58 }
59 @pages = @mpages;
60 }
61
62 # Collapse ranges...
63 if ($#pages > 1) {
64 my @cpages = ();
65 while (@pages) {
66 my $count = 0;
67 my $len = &rangelen($count, @pages);
68 if ($len <= 2) {
69 my $page = shift @pages;
70 push (@cpages, $page);
71 } else {
72 my $fpage = shift @pages;
73 my $lpage = "";
74 while ($len > 1) {
75 $lpage = shift @pages;
76 $len--;
77 }
78 my $fpno = &pageno($fpage);
79 my $lpno = &pageno($lpage);
80 $fpage =~ s/>$fpno</>${fpno}-$lpno</s;
81 push (@cpages, $fpage);
82 }
83 }
84 @pages = @cpages;
85 }
86
87 my $page = shift @pages;
88 $page =~ s/\s*$//s;
89 $cindex .= $page;
90 while (@pages) {
91 $page = shift @pages;
92 $page =~ s/\s*$//s;
93 $cindex .= ", $page";
94 }
95}
96$cindex .= $index;
97
98print "$cindex\n";
99
100sub pageno {
101 my $page = shift;
102
103 $page =~ s/^<phrase.*?>//;
104 $page =~ s/^<link.*?>//;
105
106 return $1 if $page =~ /^([^<>]+)/;
107 return "?";
108}
109
110sub rangesort {
111 my $apno = &pageno($a);
112 my $bpno = &pageno($b);
113
114 # Make sure roman pages come before arabic ones, otherwise sort them in order
115 return -1 if ($apno !~ /^\d+/ && $bpno =~ /^\d+/);
116 return 1 if ($apno =~ /^\d+/ && $bpno !~ /^\d+/);
117 return $apno <=> $bpno;
118}
119
120sub rangelen {
121 my $count = shift;
122 my @pages = @_;
123 my $len = 1;
124 my $inrange = 1;
125
126 my $current = &pageno($pages[$count]);
127 while ($count < $#pages && $inrange) {
128 $count++;
129 my $next = &pageno($pages[$count]);
130 if ($current + 1 eq $next) {
131 $current = $next;
132 $inrange = 1;
133 $len++;
134 } else {
135 $inrange = 0;
136 }
137 }
138
139 return $len;
140}
Note: See TracBrowser for help on using the repository browser.