add new c1x standard draft (n1570), +pdf2html.sh script
[c-standard] / translit.sh
#!/bin/sh

# Transliterate the output of `pdftotext -layout standard.pdf` to ASCII.
# Reads the text on stdin and writes the result to stdout.
# assumes utf8 locale..
#
# Pipeline:
#   sed  marks form feeds as "(newpage)", spells known non-ASCII characters
#        in ASCII and repairs two pdftotext layout artefacts
#   tr   turns every remaining byte outside the range \n..~ into "?"
#   awk  turns each page break into a "[page N]" marker

# A literal form feed for the first sed expression: the "\f" escape inside
# a sed regex is a GNU extension, so it is not used here.
ff=$(printf '\f')

sed -e "s/${ff}/(newpage)/g" -e '
# utf8 fixes
s/ﬁ/fi/g
s/ﬂ/fl/g
s/§/!S/g
s/©/(C)/g
s/—/--/g
s/−/-/g
s/∗/*/g
s/ˆ/^/g
s/〈/</g
s/〉/>/g
s/⎡/[^/g
s/⎤/^]/g
s/⎣/[_/g
s/⎦/_]/g
s/⎢/[ /g
s/⎥/ ]/g
s/⎧/{/g
s/⎨/{/g
s/⎩/{/g
s/±/(+-)/g
s/≤/<=/g
s/≥/>=/g
s/≠/!=/g
s/Σ/(Sum)/g
s/√/(sqrt)/g
s/π/pi/g
s/∞/(inf)/g
s/ƒ/fl./g
s/∫/(integral)/g
s/Γ/(Gamma)/g
s/×/x/g
s/•/o/g
s/⎯/-/g
s/↑/(uparrow)/g
s/↓/(downarrow)/g
s/↔/<->/g
s/→/->/g
s/‘/'\''/g
s/’/'\''/g
# pdftotext layout fixes
s/_ _/__/g
# floats are sometimes broken
s/\([0-9]\)\. \([0-9]\)/\1.\2/g
' | LC_ALL=C tr -c '\n-~' '?' | awk '
# Output runs one non-empty line behind the input: "last" holds the line
# that has been read but not yet printed, together with the blank lines
# that preceded it (collected in "nl").  Whatever is still held back when
# the input ends is never printed.
BEGIN {
        # prime "last" with the first input line
        getline
        last=$0
        # side selects which field of the held-back line is the page number:
        # 0 takes the last field, 1 takes the first field
        side=0
}
# blank lines are only counted; they are emitted with the next text line
/^$/ {
        nl=nl "\n"
        next
}
# TODO: shift page numbers
#function inc(x) {
#       if (x ~ /[0-9]/)
#               return x+1
#       if (sub(/viii$/,"ix",x) ||
#           sub(/iii$/,"iv",x) ||
#           sub(/iv$/,"v",x) ||
#           sub(/ix$/,"x",x))
#               return x
#       return x "i"
#}
# A page break.  The held-back line supplies the page number and is not
# printed; the field it is taken from alternates on every break.
/^\(newpage\)/ {
        n=split(last,a)
        if(side)
                p=a[1]
        else
                p=a[n]
        side=!side
#       if (p !~ /[0-9]/ && $0 ~ /INTERNATIONAL STANDARD/)
#               p=0
#       print "\n[page " inc(p) "]"
        print "\n[page " p "]"
        # Drop the rest of the (newpage) line and the line after it; the
        # second line read becomes the new held-back line.  At end of input
        # getline leaves $0 unchanged.
        getline
        getline
        last=$0
        next
}
# ordinary text: emit the held-back line and hold this one back instead
{
        print last
        last=nl $0
        nl=""
}'