# utf8 fixes
+# Latin ligature code points (U+FB00..U+FB03) are expanded to plain letters;
+# the pattern side must be the single ligature character, not ASCII.
s/ﬁ/fi/g
s/ﬂ/fl/g
+s/ﬀ/ff/g
+s/ﬃ/ffi/g
s/§/!S/g
s/©/(C)/g
s/—/--/g
s/−/-/g
+s/–/-/g
s/∗/*/g
s/ˆ/^/g
s/〈/</g
s/≥/>=/g
s/≠/!=/g
s/Σ/(Sum)/g
-s/√/sqrt:/g
+s/√/(sqrt)/g
s/π/pi/g
s/∞/(inf)/g
s/ƒ/fl./g
s/⎯/-/g
s/↑/(uparrow)/g
s/↓/(downarrow)/g
-s/↔/(<->)/g
-s/→/(->)/g
+s/↔/<->/g
+s/→/->/g
s/‘/'\''/g
s/’/'\''/g
+s/“/"/g
+s/”/"/g
+s/∼/~/g
# pdftotext layout fixes
s/_ _/__/g
# floats are sometimes broken
last=$0
side=0
}
+# Blank input line: nothing is printed here. A newline is queued in nl
+# (the default rule later prepends nl to the stored line), and `next`
+# skips the remaining rules for this record.
+/^$/ {
+ nl=nl "\n"
+ next
+}
+# TODO: shift page numbers
+#function inc(x) {
+# if (x ~ /[0-9]/)
+# return x+1
+# if (sub(/viii$/,"ix",x) ||
+# sub(/iii$/,"iv",x) ||
+# sub(/iv$/,"v",x) ||
+# sub(/ix$/,"x",x))
+# return x
+# return x "i"
+#}
+# Page break marker: split the stored previous line (last) into fields,
+# pick the page number p from it, print a "[page p]" marker, then read two
+# more input lines and keep only the second one as the new stored line.
+# The old value of last is overwritten here without being printed.
/^\(newpage\)/ {
n=split(last,a)
+# NOTE(review): the if-branch had no statement, which is an awk syntax
+# error. p=a[1] is restored on the assumption that the page number
+# alternates between the first and the last field of the line -- confirm.
if(side)
p=a[1]
else
p=a[n]
side=!side
- print "[page " p "]"
+# if (p !~ /[0-9]/ && $0 ~ /INTERNATIONAL STANDARD/)
+# p=0
+# print "\n[page " inc(p) "]"
+ print "\n[page " p "]"
getline
getline
last=$0
}
+# Default rule: output runs one record behind the input. Each record that
+# reaches this rule prints the previously stored line, then stores the
+# current one; the newpage rule reads that stored line via split(last,a).
{
print last
- last=$0
-}
-'
+# Blank lines queued in nl are prepended to the stored text, then the
+# queue is cleared.
+ last=nl $0
+ nl=""
+}'