
# Convenience goal: `make NAME-processed` builds every artefact derived from
# NAME. The recipe only prints a message and never creates a file called
# NAME-processed, so (unless such a file already exists) the message is
# printed each time the goal is requested.
%-processed: \
        %-tesseract.html \
        %-tesseract.txt \
        %-processed.pdf \
        %-processed-smaller.pdf
	echo created $@

# OCR
# tesseract config file passed as the last argument of the hOCR run
# (presumably the one that enables hOCR output — confirm). The default is the
# original author's path; override it on the command line or in the
# environment on other machines.
TESSERACT_HOCR_CONFIG ?= /home/martin/.tesseract.d/hocr

# OCR the TIFF with the hOCR config. tesseract is handed the output name
# without its extension and adds the extension itself.
# $(basename ...) is make's built-in: it drops only the suffix and keeps any
# directory part, so the output lands next to the target instead of in the
# current directory, and no extra process is forked.
# NOTE(review): assumes the installed tesseract names hOCR output *.html — confirm.
%-tesseract.html: %-for-ocr.tiff
	tesseract $< $(basename $@) -l eng $(TESSERACT_HOCR_CONFIG)

# OCR the TIFF into plain text. tesseract is handed the output name without
# its extension and appends ".txt" itself.
# $(basename ...) is make's built-in: it drops only the suffix and keeps any
# directory part, so the output lands next to the target instead of in the
# current directory, and no extra process is forked.
%-tesseract.txt: %-for-ocr.tiff
	tesseract $< $(basename $@)

# Convert the greyscale image to TIFF, the input format used by the OCR rules.
# `1<>` opens the target for reading AND writing on stdout (presumably because
# pamtotiff needs a seekable, readable output — confirm against the Netpbm
# manual). Unlike `>`, that redirection does not truncate an existing file, so
# a stale target is removed first to avoid leftover bytes from a larger old
# image.
%-for-ocr.tiff: %-gray.pgm
	rm -f $@
	pamtotiff $< 1<> $@

# Reduce the cleaned-up page to greyscale.
# The shell creates the redirection target before ppmtopgm even starts, so
# writing straight to $@ would leave an empty but "up to date" file behind
# when the conversion fails. Write to a temporary name instead and move it
# into place only on success.
%-gray.pgm: %-unpapered.pnm
	ppmtopgm $< > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@


# Tidied PDF
# Turn the unpaper output into a PDF; `convert` picks the output format from
# the target's file name.
%-processed.pdf: %-unpapered.pnm
	convert $< $@

# Re-write the tidied PDF through Ghostscript's pdfwrite device using the
# /screen settings preset and PDF compatibility level 1.4.
# -dNOPAUSE, -dQUIET and -dBATCH make the run non-interactive. gs is looked
# up on PATH via env. The argument order is unchanged from the original rule.
%-processed-smaller.pdf: %-processed.pdf
	/usr/bin/env gs -sDEVICE=pdfwrite \
	    -dCompatibilityLevel=1.4 -dPDFSETTINGS=/screen \
	    -dNOPAUSE -dQUIET -dBATCH \
	    -sOutputFile=$@ $<


# unpaper output
# Name of an optional file, looked up in the directory make runs in, that
# holds extra command-line options for unpaper.
UNPAPER_ADDITIONAL_OPTIONS_FILENAME := unpaper-options-additional
# Contents of that file, read once while the makefile is parsed; empty when
# the file does not exist. The existence check uses make's built-in
# $(wildcard) so no external `test` binary (or its absolute path) is needed
# and no shell is spawned when the file is absent.
UNPAPER_ADDITIONAL_OPTIONS_FROM_DIR := $(if $(wildcard $(UNPAPER_ADDITIONAL_OPTIONS_FILENAME)),$(shell cat $(UNPAPER_ADDITIONAL_OPTIONS_FILENAME)))

# Clean up the scanned page with unpaper (single-page layout, A4 size, timing
# output enabled), plus any extra options read from the per-directory options
# file.
# --overwrite is required for rebuilds: make only re-runs this recipe when the
# target is missing or stale, and in the stale case the file already exists,
# which unpaper otherwise treats as an error.
%-unpapered.pnm: %.ppm
	unpaper --overwrite --time --layout single --size a4 $(UNPAPER_ADDITIONAL_OPTIONS_FROM_DIR) $< $@

# Render a PDF to a single PPM image. pdftoppm takes an output *root* and
# appends ".ppm" itself (with -singlefile there is no page-number suffix), so
# it is given the pattern stem $* — the target name without ".ppm".
%.ppm: %.pdf
	pdftoppm -singlefile $< $*

# Fallback for an input file that has no extension: copy it to NAME.ppm so
# the rest of the pipeline can pick it up. `cp -a` keeps the file's attributes.
# Source and destination are taken from the prerequisite and the target
# respectively, rather than each being reconstructed from the other's name.
%.ppm: %
	cp -a $< $@


# Source PDFs in the current directory. The pipeline's own products
# (*-processed.pdf and *-processed-smaller.pdf) also match *.pdf, so they are
# filtered out; otherwise every later run would treat earlier outputs as new
# inputs and process them again.
input_pdfs := $(filter-out %-processed.pdf %-processed-smaller.pdf,$(wildcard *.pdf))
# One size-reduced PDF per source PDF.
targets := $(input_pdfs:.pdf=-processed-smaller.pdf)

# `all` is a command name, not a file: declare it phony so a stray file called
# "all" cannot mask it.
.PHONY : all
all : $(targets)

# Keep the end products even when make built them only as intermediate steps
# of a longer chain, instead of deleting them once the final goal is done.
.PRECIOUS: \
        %-processed.pdf \
        %-processed-smaller.pdf \
        %-tesseract.html \
        %-tesseract.txt
