/*
  PDFAnnotExtractor.java

  Copyright (C) 2006-2008, 2011, 2012 Heiko Oberdiek

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  MA 02110-1301 USA

  This file is part of PDFAnnotExtractor. See README.
*/

package pax;

import java.io.*;
import java.math.BigDecimal;
import java.text.*;
import java.util.*;
import org.apache.pdfbox.cos.*;
import org.apache.pdfbox.pdfparser.*;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.pdmodel.common.*;
import org.apache.pdfbox.pdmodel.interactive.action.*;
import org.apache.pdfbox.pdmodel.interactive.action.type.*;
import org.apache.pdfbox.pdmodel.interactive.annotation.*;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.*;

/**
 * Reads a PDF file and writes a sequence of entries describing the file,
 * its pages and its link annotations to an output file.
 *
 * <p>For an input {@code name.pdf} the output is written to
 * {@code name.pax}. Only annotations of subtype {@code PDF_LINK} are
 * emitted; every other annotation produces a warning on standard error.
 */
public class PDFAnnotExtractor implements Constants {

  /** PDF file that is read. */
  protected File inputFile;
  /** File the entries are written to. */
  protected File outputFile;
  /** Loaded PDF document. */
  protected PDDocument doc;
  /** Document catalog of {@link #doc}. */
  protected PDDocumentCatalog catalog;
  /** Writer for the output entries. */
  protected Entry entry;
  /** Number of destination labels handed out so far. */
  protected int destCount;

  // NOTE(review): the usage text names no arguments; it looks truncated.
  // Confirm the intended wording before changing this runtime string.
  protected static final String USAGE =
      "Syntax: java PDFAnnotExtractor ";
  protected static final String EXT_PDF = ".pdf";
  protected static final String EXT_PAX = ".pax";

  /**
   * Command line entry point: processes every argument as a PDF file name.
   * Prints the usage text and exits with status 1 if no argument is given.
   *
   * @param argv file names, with or without the {@code .pdf} extension
   */
  public static void main(String[] argv) {
    if (argv.length < 1) {
      System.err.println(USAGE);
      System.exit(1);
    }
    for (int i = 0; i < argv.length; i++) {
      processFile(argv[i]);
    }
  }

  /**
   * Extracts the annotations of one PDF file.
   *
   * <p>If {@code fileName} is an existing file, a trailing {@code .pdf} is
   * stripped to form the base name of the output file. Otherwise
   * {@code fileName + ".pdf"} is tried as input. If neither exists the
   * program terminates via {@link #error(String, Exception)}.
   *
   * @param fileName name of the PDF file, with or without extension
   */
  public static void processFile(String fileName) {
    File file = new File(fileName);
    if (file.isFile()) {
      fileName = stripFromEnd(fileName, EXT_PDF);
    } else {
      File testFile = new File(fileName + EXT_PDF);
      if (testFile.isFile()) {
        file = testFile;
      } else {
        System.err.println(USAGE);
        error("PDF file not found: " + fileName, null);
      }
    }
    System.out.println("* Processing file `" + file.toString() + "' ...");
    PDFAnnotExtractor p =
        new PDFAnnotExtractor(file, new File(fileName + EXT_PAX));
    try {
      p.parse();
    } finally {
      // Close even if parsing fails with an unchecked exception, so that
      // the entries written so far reach the output file.
      p.close();
    }
  }

  /**
   * Loads the PDF document and opens the output file. Either failure
   * terminates the program via {@link #error(String, Exception)}.
   *
   * @param inputFile PDF file to read
   * @param outputFile file to write the entries to
   */
  public PDFAnnotExtractor(File inputFile, File outputFile) {
    this.inputFile = inputFile;
    this.outputFile = outputFile;
    destCount = 0;
    try {
      doc = PDDocument.load(inputFile);
      catalog = doc.getDocumentCatalog();
    } catch (IOException e) {
      error("Loading failed: " + inputFile, e);
    }
    try {
      entry = new Entry(new BufferedWriter(new FileWriter(outputFile)));
    } catch (IOException e) {
      error("Cannot open output: " + outputFile, e);
    }
  }

  /**
   * Closes the output and the PDF document. A failure of either step
   * terminates the program via {@link #error(String, Exception)}.
   */
  public void close() {
    // The output is closed first: error() exits the JVM, so a failure to
    // close the (read-only) input must not prevent the output from being
    // closed.
    try {
      entry.close();
    } catch (EntryWriteException e) {
      error("Closing failed: " + outputFile, e);
    }
    try {
      doc.close();
    } catch (IOException e) {
      error("Closing failed: " + inputFile, e);
    }
  }

  /**
   * Writes all entries for the document: version, file, page count,
   * base URL and then the pages with their annotations.
   * {@link #cmd_info()} is not invoked here.
   */
  public void parse() {
    cmd_pax();
    cmd_file();
    cmd_pagenum();
    cmd_baseurl();
    parse_pages();
  }

  /** Writes the {@code CMD_PAX} entry carrying {@code PAX_VERSION}. */
  public void cmd_pax() {
    try {
      entry.setCmd(CMD_PAX);
      entry.addArg(PAX_VERSION);
      entry.write();
    } catch (Exception e) {
      handleCmdException(e);
    }
  }

  /**
   * Writes the {@code CMD_FILE} entry: the input file name, its size
   * (if greater than zero) and its modification date (if known).
   *
   * <p>The date has the form {@code D:yyyyMMddHHmmss} followed by
   * {@code Z} for UTC or by the offset written as {@code +hh'mm'}.
   */
  public void cmd_file() {
    try {
      entry.setCmd(CMD_FILE);
      entry.addArg(formatString(inputFile.toString()));
      long size = inputFile.length();
      if (size > 0) {
        entry.putKV(KEY_SIZE, "" + size);
      }
      long time = inputFile.lastModified();
      if (time != 0) {
        Date d = new Date(time);
        // A fixed locale keeps the calendar Gregorian and the digits ASCII
        // regardless of the default locale of the JVM.
        DateFormat df = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US);
        String date = "D:" + df.format(d);
        df = new SimpleDateFormat("Z", Locale.US);
        String zone = df.format(d);
        if (zone.equals("+0000")) {
          zone = "Z";
        } else {
          // "Z" pattern output: SIGN DIGIT DIGIT DIGIT DIGIT
          zone = zone.substring(0, 3) + "'" + zone.substring(3, 5) + "'";
        }
        entry.putKV(KEY_DATE, date + zone);
      }
      entry.write();
    } catch (Exception e) {
      handleCmdException(e);
    }
  }

  /** Writes the {@code CMD_PAGENUM} entry with the number of pages. */
  public void cmd_pagenum() {
    try {
      int pagenum = doc.getNumberOfPages();
      entry.setCmd(CMD_PAGENUM);
      entry.addArg("" + pagenum);
      entry.write();
    } catch (Exception e) {
      handleCmdException(e);
    }
  }

  /**
   * Collects author, creator, keywords, producer, subject and title from
   * the document information and writes a {@code CMD_INFO} entry unless
   * {@code entry.isEmptyKV()} reports that nothing was stored.
   */
  public void cmd_info() {
    try {
      entry.setCmd(CMD_INFO);
      PDDocumentInformation info = doc.getDocumentInformation();
      entry.putKV(KEY_AUTHOR, formatString(info.getAuthor()));
      entry.putKV(KEY_CREATOR, formatString(info.getCreator()));
      entry.putKV(KEY_KEYWORDS, formatString(info.getKeywords()));
      entry.putKV(KEY_PRODUCER, formatString(info.getProducer()));
      entry.putKV(KEY_SUBJECT, formatString(info.getSubject()));
      entry.putKV(KEY_TITLE, formatString(info.getTitle()));
      if (!entry.isEmptyKV()) {
        entry.write();
      }
    } catch (Exception e) {
      handleCmdException(e);
    }
  }

  /**
   * Writes the {@code CMD_BASEURL} entry if the catalog has a
   * {@code PDF_URI} dictionary with a non-empty {@code PDF_BASE} string.
   * Nothing is written otherwise.
   */
  public void cmd_baseurl() {
    try {
      entry.setCmd(CMD_BASEURL);
      // getDictionaryObject() resolves indirect references, which a plain
      // getItem() does not.
      COSBase uri = catalog.getCOSDictionary()
          .getDictionaryObject(COSName.getPDFName(PDF_URI));
      if (!(uri instanceof COSDictionary)) {
        return;
      }
      String baseurl = ((COSDictionary) uri).getString(PDF_BASE);
      if (baseurl != null && baseurl.length() > 0) {
        entry.addArg(formatString(baseurl));
        entry.write();
      }
    } catch (EntryWriteException e) {
      handleCmdException(e);
    } catch (Exception ignored) {
      // The base URL is optional; a malformed one is skipped on purpose.
    }
  }

  /**
   * Processes all pages in document order and finally flushes the
   * delayed entries.
   */
  public void parse_pages() {
    ListIterator iter = catalog.getAllPages().listIterator();
    while (iter.hasNext()) {
      PDPage page = (PDPage) iter.next();
      // After next(), nextIndex() is the index of the returned element
      // plus one, i.e. the 1-based page number.
      int num = iter.nextIndex();
      parse_page(num, page);
    }
    try {
      entry.flushDelayed();
    } catch (EntryWriteException e) {
      System.err.println("Write error: " + outputFile);
      System.err.println(describeCause(e));
    }
  }

  /**
   * Writes the page entry and the annotation entries of one page.
   *
   * @param num 1-based page number
   * @param page the page
   */
  public void parse_page(int num, PDPage page) {
    cmd_page(num, page);
    parse_annots(num, page);
  }

  /**
   * Writes the {@code CMD_PAGE} entry: page number, media box, a non-zero
   * rotation, the crop box if it differs from the media box, and the
   * bleed, trim and art boxes if they differ from the crop box.
   *
   * @param num 1-based page number
   * @param page the page
   */
  public void cmd_page(int num, PDPage page) {
    try {
      entry.setCmd(CMD_PAGE);
      entry.addArg("" + num);
      entry.withKV();

      // rotate entry
      int rot = page.findRotation();
      if (rot != 0) {
        entry.putKV(KEY_ROTATE, "" + rot);
      }

      // box entries
      PDRectangle mediaBox = page.findMediaBox();
      PDRectangle cropBox = page.findCropBox();
      PDRectangle bleedBox = page.getBleedBox();
      PDRectangle trimBox = page.getTrimBox();
      PDRectangle artBox = page.getArtBox();
      entry.addArg(formatBox(mediaBox));
      if (!equals(cropBox, mediaBox)) {
        entry.putKV(KEY_CROP_BOX, formatBox(cropBox));
      }
      if (!equals(bleedBox, cropBox)) {
        entry.putKV(KEY_BLEED_BOX, formatBox(bleedBox));
      }
      if (!equals(trimBox, cropBox)) {
        entry.putKV(KEY_TRIM_BOX, formatBox(trimBox));
      }
      if (!equals(artBox, cropBox)) {
        entry.putKV(KEY_ART_BOX, formatBox(artBox));
      }
      entry.write();
    } catch (Exception e) {
      handleCmdException(e);
    }
  }

  /**
   * Passes every dictionary in the page's annotation array to
   * {@link #cmd_annot(int, COSDictionary)}. A missing or malformed array
   * and non-dictionary elements are skipped silently.
   *
   * @param num 1-based page number
   * @param page the page
   */
  public void parse_annots(int num, PDPage page) {
    COSBase annots =
        page.getCOSDictionary().getDictionaryObject(COSName.ANNOTS);
    if (!(annots instanceof COSArray)) {
      return;
    }
    COSArray array = (COSArray) annots;
    for (int i = 0; i < array.size(); i++) {
      COSBase annot = array.getObject(i);
      if (annot instanceof COSDictionary) {
        cmd_annot(num, (COSDictionary) annot);
      }
    }
  }

  /**
   * Writes the {@code CMD_ANNOT} entry for one link annotation.
   *
   * <p>Supported are links with a direct destination, named actions and
   * the action types URI, GoTo and GoToR. For destinations inside the
   * document a destination label is allocated and the matching
   * destination entry is written as a delayed entry. Anything else is
   * reported as a warning on standard error and no entry is completed.
   *
   * @param num 1-based page number of the annotation
   * @param annot annotation dictionary
   */
  public void cmd_annot(int num, COSDictionary annot) {
    try {
      entry.setCmd(CMD_ANNOT);
      entry.addArg("" + num);

      // Type
      String type = annot.getNameAsString(COSName.TYPE, PDF_ANNOT);
      if (!type.equals(PDF_ANNOT)) {
        throw new Exception("Wrong annotation type: " + type);
      }

      // Subtype
      String subtype = annot.getNameAsString(COSName.SUBTYPE);
      if (subtype == null) {
        throw new Exception("Missing annotation subtype.");
      }
      if (!subtype.equals(PDF_LINK)) {
        throw new Exception("Unsupported annotation subtype: " + subtype);
      }
      entry.addArg(subtype);
      entry.withKV();

      // Rect
      COSArray array = (COSArray) annot.getDictionaryObject(PDF_RECT);
      if (array == null) {
        throw new Exception("Missing annotation rectangle.");
      }
      PDRectangle rect = new PDRectangle(array);
      entry.addArg(formatBox(rect));

      // A
      COSDictionary a = (COSDictionary) annot.getDictionaryObject(PDF_A);
      if (a == null) {
        // No action: the link must carry a direct destination.
        COSBase cos = annot.getDictionaryObject(PDF_DEST);
        if (cos == null) {
          throw new Exception("Unsupported link annotation "
              + "without action.");
        }
        entry.addArg(PDF_GOTO);
        destCount++;
        entry.putKV(KEY_DEST_LABEL, "" + destCount);
        annot_attrs(annot);
        entry.write();
        add_dest(PDDestination.create(cos));
        entry.writeDelayed();
        return;
      }

      PDAction action = PDActionFactory.createAction(a);
      if (action == null) {
        // try named action, unsupported by PDFBox
        if (PDF_NAMED.equals(a.getNameAsString(PDF_S))) {
          entry.addArg(PDF_NAMED);
          String name = a.getNameAsString(PDF_N);
          if (name != null) {
            entry.putKV(KEY_NAME, name);
            annot_attrs(annot);
            entry.write();
            return;
          }
        }
        throw new Exception("Unsupported link annotation.");
      }

      String actionSubtype = action.getSubType();
      entry.addArg(actionSubtype);

      if (action instanceof PDActionURI) {
        PDActionURI uri = (PDActionURI) action;
        entry.putKV(KEY_URI, formatString(uri.getURI()));
        if (uri.shouldTrackMousePosition()) {
          entry.putKV(KEY_IS_MAP, null);
        }
      } else if (action instanceof PDActionGoTo) {
        destCount++;
        entry.putKV(KEY_DEST_LABEL, "" + destCount);
        annot_attrs(annot);
        entry.write();
        add_dest(((PDActionGoTo) action).getDestination());
        entry.writeDelayed();
        return;
      } else if (action instanceof PDActionRemoteGoTo) {
        PDActionRemoteGoTo gotor = (PDActionRemoteGoTo) action;
        String file = gotor.getFile().getFile();
        if (file == null || file.length() == 0) {
          throw new Exception("GoToR: missing file");
        }
        entry.putKV(KEY_FILE, formatString(file));

        // The destination is read from the raw COS object.
        COSBase d = gotor.getD();
        if (d instanceof COSString) {
          entry.putKV(KEY_DEST_NAME,
              formatString(((COSString) d).getString()));
        } else if (d instanceof COSName) {
          entry.putKV(KEY_DEST_NAME,
              formatString(((COSName) d).getName()));
        } else if (d instanceof COSArray) {
          COSArray dest = (COSArray) d;
          int page_num = dest.getInt(0);
          entry.putKV(KEY_DEST_PAGE, "" + page_num);
          entry.putKV(KEY_DEST_VIEW, "/" + remoteDestView(dest));
        } else {
          throw new Exception("GoToR: unknown dest type");
        }
      } else {
        throw new Exception("Unsupported link annotation type: "
            + actionSubtype);
      }

      annot_attrs(annot);
      entry.write();
    } catch (EntryWriteException e) {
      handleCmdException(e);
    } catch (Exception e) {
      System.err.println("!!! Warning: Annotation on page " + num
          + " not recognized!");
      System.err.println(" " + e.toString());
      Throwable cause = e.getCause();
      if (cause != null) {
        System.err.println(" " + cause.toString());
      }
      if (e instanceof ClassCastException) {
        e.printStackTrace();
      }
    }
  }

  /**
   * Builds the view specification (without the leading slash) of an
   * explicit remote destination array: the view name at index 1 followed
   * by its parameters starting at index 2.
   *
   * @param dest destination array of a GoToR action
   * @return view name and parameters separated by single spaces
   * @throws Exception if the view name is missing or unknown, or if a
   *     rectangle view has fewer than six array elements
   */
  private String remoteDestView(COSArray dest) throws Exception {
    String view = dest.getName(1);
    if (view == null) {
      throw new Exception("Unknown destination view type");
    }
    int size = dest.size();

    if (view.equals(PDF_FIT) || view.equals(PDF_FITB)) {
      return view;
    }

    if (view.equals(PDF_FITH) || view.equals(PDF_FITBH)
        || view.equals(PDF_FITV) || view.equals(PDF_FITBV)) {
      COSBase obj = (size >= 3) ? dest.getObject(2) : null;
      if (obj instanceof COSNumber) {
        return view + " " + formatNumber((COSNumber) obj);
      }
      // Without a usable coordinate the view degrades to the plain
      // fit variant.
      if (view.equals(PDF_FITH) || view.equals(PDF_FITV)) {
        return PDF_FIT;
      }
      return PDF_FITB;
    }

    if (view.equals(PDF_XYZ)) {
      // left, top and zoom; each is PDF_NULL if absent or not a number
      StringBuffer buf = new StringBuffer(view);
      for (int i = 2; i <= 4; i++) {
        buf.append(' ');
        COSBase obj = (i < size) ? dest.getObject(i) : null;
        if (obj instanceof COSNumber) {
          buf.append(formatNumber((COSNumber) obj));
        } else {
          buf.append(PDF_NULL);
        }
      }
      return buf.toString();
    }

    if (view.equals(PDF_FITR)) {
      if (size < 6) {
        throw new Exception("Rectangle destination without"
            + " correct number of parameters");
      }
      return view
          + " " + formatNumber((COSNumber) dest.getObject(2))
          + " " + formatNumber((COSNumber) dest.getObject(3))
          + " " + formatNumber((COSNumber) dest.getObject(4))
          + " " + formatNumber((COSNumber) dest.getObject(5));
    }

    throw new Exception("Unknown destination view type");
  }

  /**
   * Prepares the {@code CMD_DEST} entry for a destination inside the
   * document: target page number (1-based), the current destination
   * label, the view type and its parameters. The entry is not written
   * here; the caller does that.
   *
   * <p>A named destination is first looked up in the document's
   * destination name tree.
   *
   * @param dest destination to describe
   * @throws Exception if the target page is not part of the document or
   *     the destination type is not supported
   */
  private void add_dest(PDDestination dest) throws Exception {
    entry.setCmd(CMD_DEST);
    entry.withKV();

    if (dest instanceof PDNamedDestination) {
      String name = ((PDNamedDestination) dest).getNamedDestination();
      // NOTE(review): this chain is not null-checked; presumably a
      // document without a name dictionary ends up in the caller's
      // generic warning. Confirm.
      Object obj = catalog.getNames().getDests().getValue(name);
      if (obj instanceof PDDestination) {
        dest = (PDDestination) obj;
      }
    }

    if (dest instanceof PDPageDestination) {
      PDPageDestination pd = (PDPageDestination) dest;
      PDPage page = pd.getPage();
      int pagenum = catalog.getAllPages().indexOf(page);
      if (pagenum < 0) {
        throw new Exception("Link to unknown page");
      }
      entry.addArg("" + (pagenum + 1));
      entry.addArg("" + destCount);

      if (pd instanceof PDPageFitDestination) {
        PDPageFitDestination d = (PDPageFitDestination) pd;
        entry.addArg(d.fitBoundingBox() ? PDF_FITB : PDF_FIT);
        return;
      }
      if (pd instanceof PDPageFitHeightDestination) {
        PDPageFitHeightDestination d = (PDPageFitHeightDestination) pd;
        entry.addArg(d.fitBoundingBox() ? PDF_FITBV : PDF_FITV);
        COSArray a = d.getCOSArray();
        if (a.size() > 2) {
          COSNumber n = (COSNumber) a.getObject(2);
          if (n != null) {
            entry.putKV(KEY_DEST_X, formatNumber(n));
          }
        }
        return;
      }
      if (pd instanceof PDPageFitWidthDestination) {
        PDPageFitWidthDestination d = (PDPageFitWidthDestination) pd;
        entry.addArg(d.fitBoundingBox() ? PDF_FITBH : PDF_FITH);
        COSArray a = d.getCOSArray();
        if (a.size() > 2) {
          COSNumber n = (COSNumber) a.getObject(2);
          if (n != null) {
            entry.putKV(KEY_DEST_Y, formatNumber(n));
          }
        }
        return;
      }
      if (pd instanceof PDPageFitRectangleDestination) {
        PDPageFitRectangleDestination d =
            (PDPageFitRectangleDestination) pd;
        entry.addArg(PDF_FITR);
        COSArray a = d.getCOSArray();
        int size = a.size();
        if (size != 6) {
          throw new Exception("Rectangle destination without"
              + " correct number of parameters");
        }
        entry.putKV(KEY_DEST_RECT,
            formatNumber((COSNumber) a.getObject(2)) + " "
            + formatNumber((COSNumber) a.getObject(3)) + " "
            + formatNumber((COSNumber) a.getObject(4)) + " "
            + formatNumber((COSNumber) a.getObject(5)));
        return;
      }
      if (pd instanceof PDPageXYZDestination) {
        PDPageXYZDestination d = (PDPageXYZDestination) pd;
        entry.addArg(PDF_XYZ);
        COSArray a = d.getCOSArray();
        int size = a.size();
        COSNumber n;
        if (size > 2) {
          n = (COSNumber) a.getObject(2);
          if (n != null) {
            entry.putKV(KEY_DEST_X, formatNumber(n));
          }
        }
        if (size > 3) {
          n = (COSNumber) a.getObject(3);
          if (n != null) {
            entry.putKV(KEY_DEST_Y, formatNumber(n));
          }
        }
        if (size > 4) {
          n = (COSNumber) a.getObject(4);
          if (n != null) {
            entry.putKV(KEY_DEST_ZOOM, formatNumber(n));
          }
        }
        return;
      }
    }
    throw new Exception("Unknown destination type");
  }

  /**
   * Copies the appearance related attributes {@code PDF_C},
   * {@code PDF_BORDER}, {@code PDF_BS} and {@code PDF_H} of an annotation
   * into the current entry.
   *
   * @param annot annotation dictionary
   */
  protected void annot_attrs(COSDictionary annot) {
    put_attr(annot, PDF_C, KEY_C);
    put_attr(annot, PDF_BORDER, KEY_BORDER);
    put_attr(annot, PDF_BS, KEY_BS);
    put_attr(annot, PDF_H, KEY_H);
  }

  /**
   * Stores the string form of one annotation attribute, as produced by a
   * {@code StringVisitor}, in the current entry. Missing attributes,
   * empty results and conversion failures are skipped.
   *
   * @param annot annotation dictionary
   * @param pdfKey key of the attribute in the annotation dictionary
   * @param entryKey key under which the value is stored in the entry
   */
  protected void put_attr(COSDictionary annot,
      String pdfKey, String entryKey) {
    try {
      COSBase value = annot.getDictionaryObject(pdfKey);
      if (value == null) {
        return;
      }
      String str = value.accept(new StringVisitor()).toString();
      if (str == null || str.length() == 0) {
        return;
      }
      entry.putKV(entryKey, str);
    } catch (Exception ignored) {
      // Attributes are optional; one that cannot be converted is dropped.
    }
  }

  /**
   * Compares two rectangles by their four coordinates.
   *
   * @param a first rectangle
   * @param b second rectangle
   * @return {@code true} if all four coordinates are equal
   */
  protected boolean equals(PDRectangle a, PDRectangle b) {
    return a.getLowerLeftX() == b.getLowerLeftX()
        && a.getLowerLeftY() == b.getLowerLeftY()
        && a.getUpperRightX() == b.getUpperRightX()
        && a.getUpperRightY() == b.getUpperRightY();
  }

  /**
   * Formats a rectangle as "llx lly urx ury".
   *
   * @param r rectangle
   * @return the four coordinates separated by single spaces
   */
  protected String formatBox(PDRectangle r) {
    return formatNumber(r.getLowerLeftX()) + ' '
        + formatNumber(r.getLowerLeftY()) + ' '
        + formatNumber(r.getUpperRightX()) + ' '
        + formatNumber(r.getUpperRightY());
  }

  /**
   * Encodes a string as hex string delimited by {@code HEX_STR_BEG} and
   * {@code HEX_STR_END}.
   *
   * @param s string to encode, may be {@code null}
   * @return the encoded string; {@code s} itself if it is {@code null}
   *     or empty
   */
  protected String formatString(String s) {
    if (s == null || s.length() == 0) {
      return s;
    }
    return HEX_STR_BEG + (new COSString(s)).getHexString() + HEX_STR_END;
  }

  /**
   * Formats a number without a trailing ".0" and without exponent.
   *
   * @param f number
   * @return decimal representation of {@code f}
   */
  protected String formatNumber(float f) {
    String s = "" + f;
    if (s.indexOf('E') >= 0) {
      // Float.toString() switches to scientific notation for very large
      // and very small magnitudes; expand it to plain decimal digits.
      return new BigDecimal(s).stripTrailingZeros().toPlainString();
    }
    return stripFromEnd(s, ".0");
  }

  /**
   * Formats a COS number, see {@link #formatNumber(float)}.
   *
   * @param obj number object, must not be {@code null}
   * @return decimal representation of the number
   */
  protected String formatNumber(COSNumber obj) {
    return formatNumber(obj.floatValue());
  }

  /**
   * Removes a suffix from a string.
   *
   * @param str string
   * @param ext suffix to remove
   * @return {@code str} without {@code ext} if it ends with it,
   *     otherwise {@code str} unchanged
   */
  protected static String stripFromEnd(String str, String ext) {
    if (str.endsWith(ext)) {
      return str.substring(0, str.length() - ext.length());
    }
    return str;
  }

  /**
   * Prints an error message and terminates the program with status 1.
   *
   * @param msg message text
   * @param e exception to report, may be {@code null}
   * @param stackTrace whether to print the stack trace of {@code e}
   *     instead of its string form
   */
  protected static void error(String msg, Exception e,
      boolean stackTrace) {
    System.err.println("!!! Error: " + msg);
    if (e != null) {
      if (stackTrace) {
        e.printStackTrace();
      } else {
        System.err.println(e.toString());
      }
    }
    System.exit(1);
  }

  /**
   * Prints an error message without stack trace and terminates the
   * program with status 1.
   *
   * @param msg message text
   * @param e exception to report, may be {@code null}
   */
  protected static void error(String msg, Exception e) {
    error(msg, e, false);
  }

  /**
   * Reports an exception raised while building or writing the current
   * entry. The program continues afterwards.
   *
   * @param e the exception
   */
  protected void handleCmdException(Exception e) {
    System.err.println("!!! Error during write of entry `"
        + entry.getCmd() + "'!\n");
    if (e instanceof EntryWriteException) {
      System.err.println("Write error: " + outputFile);
      System.err.println(describeCause(e));
    } else {
      e.printStackTrace();
    }
  }

  /**
   * Returns the string form of the cause of an exception, or of the
   * exception itself if it has no cause.
   */
  private static String describeCause(Throwable e) {
    Throwable cause = e.getCause();
    return (cause != null) ? cause.toString() : e.toString();
  }
}