pdftotext
[c-standard] / translit.sh
#!/bin/sh

# Turn the output of `pdftotext -layout standard.pdf` into plain ASCII.
#
# Reads the extracted text on stdin and writes the converted text to stdout.
# The transliteration table below is written in UTF-8, so the input is
# assumed to be UTF-8 as well.
#
# The work is done by a three stage pipeline:
#   1. sed  - mark page breaks, transliterate known non-ASCII characters
#             and undo some spacing artifacts of the -layout mode
#   2. tr   - replace every remaining byte outside newline..'~' by '?'
#   3. awk  - replace the lines around each page break by a "[page N]" line

translit() {
        # The form feed is passed to sed as a literal character: a "\f"
        # escape inside a regular expression is not defined by POSIX sed.
        sed -e "s/$(printf '\f')/(newpage)/g" -e '
# utf8 fixes
# (U+FB01 and U+FB02 ligatures first, then symbols)
s/ﬁ/fi/g
s/ﬂ/fl/g
s/§/!S/g
s/©/(C)/g
s/—/--/g
s/−/-/g
s/∗/*/g
s/ˆ/^/g
s/〈/</g
s/〉/>/g
s/⎡/[^/g
s/⎤/^]/g
s/⎣/[_/g
s/⎦/_]/g
s/⎢/[ /g
s/⎥/ ]/g
s/⎧/{/g
s/⎨/{/g
s/⎩/{/g
s/±/(+-)/g
s/≤/<=/g
s/≥/>=/g
s/≠/!=/g
s/Σ/(Sum)/g
s/√/sqrt:/g
s/π/pi/g
s/∞/(inf)/g
s/ƒ/fl./g
s/∫/(integral)/g
s/Γ/(Gamma)/g
s/×/x/g
s/•/o/g
s/⎯/-/g
s/↑/(uparrow)/g
s/↓/(downarrow)/g
s/↔/(<->)/g
s/→/(->)/g
s/‘/'\''/g
s/’/'\''/g
# pdftotext layout fixes
# join underscores that were split by a space
s/_ _/__/g
# join "<digit>. <digit>" into "<digit>.<digit>"; the dot is escaped so
# that only a real dot matches (not e.g. the comma in "0, 1")
s/\([0-9]\)\. \([0-9]\)/\1.\2/g
' | LC_ALL=C tr -c '\n-~' '?' | awk '
# Output runs one line behind the input: "last" holds the previous line
# until it is known whether a page break follows it.
BEGIN {
        getline
        last = $0
        # side selects which field of the held line is the page number:
        # 0 = last field, 1 = first field; it flips at every page break
        # (presumably the number alternates between the two page corners).
        side = 0
}
# A line starting with the page break marker: the held line is consumed
# as the source of the page number and is not printed.
/^\(newpage\)/ {
        n = split(last, a)
        if (side)
                p = a[1]
        else
                p = a[n]
        side = !side
        print "[page " p "]"
        # drop the marker line and the line after it, then resume with
        # the line that follows
        getline
        getline
        last = $0
        next
}
# Any other line: emit the held line and hold the current one.
# NOTE: there is no END rule, so the line held at end of input is never
# printed.
{
        print last
        last = $0
}
'
}

translit