...
Code Block | ||||
---|---|---|---|---|
| ||||
public class PDFAdapter implements FileLoader, HasRGBPixels, HasText, HasLineGraphics { private File file; private double[][][] pixels; private List<String> words; private List<Path2D> graphics; static public void main(String[] args) { List<Double> weights = new ArrayList<Double>(); List<PairwiseComparison> comparisons = new ArrayList<PairwiseComparison>(); PairwiseComparison comparison = new PairwiseComparison(); comparison.setId(UUID.randomUUID().toString()); comparison.setFirstDataset(new File("data/test1.pdf")); comparison.setSecondDataset(new File("data/test2.pdf")); comparison.setAdapterId(PDFAdapter.class.getName()); comparison.setExtractorId(TextHistogramExtractor.class.getName()); comparison.setMeasureId(LabelHistogramEuclidianDistanceMeasure.class.getName()); comparisons.add(comparison); weights.add(0.7); comparison = new PairwiseComparison(); comparison.setId(UUID.randomUUID().toString()); comparison.setFirstDataset(new File("data/test1.pdf")); comparison.setSecondDataset(new File("data/test2.pdf")); comparison.setAdapterId(PDFAdapter.class.getName()); comparison.setExtractorId(TextHistogramExtractor.class.getName()); comparison.setExtractorId(ImageHistogramExtractor.class.getName()); comparison.setMeasureId(LabelHistogramEuclidianDistanceMeasure.class.getName()); comparison.setMeasureId(LabelHistogramIntersectionDistanceMeasure.class.getName()); comparisons.add(comparison); weights.add(0.2); comparison = new PairwiseComparison(); comparison.setId(UUID.randomUUID().toString()); comparison.setFirstDataset(new File("data/test1.pdf")); comparison.setSecondDataset(new File("data/test2.pdf")); comparison.setAdapterId(PDFAdapter.class.getName()); comparison.setExtractorId(TextHistogramExtractor.class.getName()); comparison.setExtractorId(LineGraphicsHistogramExtractor.class.getName()); comparison.setMeasureId(LabelHistogramEuclidianDistanceMeasure.class.getName()); comparisons.add(comparison); weights.add(0.1); ComprehensiveEngine engine = new ComprehensiveEngine(); Double d = engine.compute(comparisons, weights); System.out.println(d); System.exit(0); ExecutionEngine ee = new ExecutionEngine(); ee.submit(comparison, new ComparisonStatusHandler() { @Override public void onStarted() { System.out.println("STARTED : "); } @Override public void onFailed(String msg, Throwable e) { System.out.println("FAILED : " + msg); e.printStackTrace(); System.exit(0); } @Override public void onDone(double value) { System.out.println("DONE : " + value); System.exit(0); } @Override public void onAborted(String msg) { System.out.println("ABORTED : " + msg); System.exit(0); } }); } public PDFAdapter() { } // ---------------------------------------------------------------------- // FileLoader // ---------------------------------------------------------------------- @Override public void load(File file) { this.file = file; } @Override public String getName() { return "PDF Document"; } @Override public List<String> getSupportedMediaTypes() { List<String> mediaTypes = new ArrayList<String>(); mediaTypes.add("application/pdf"); return mediaTypes; } // ---------------------------------------------------------------------- // HasRGBPixels // ---------------------------------------------------------------------- @Override public double getRGBPixel(int row, int column, int band) { if ((pixels == null) && (getRGBPixels() == null)) { return Double.NaN; } else { return pixels[row][column][band]; } } @Override public double[][][] getRGBPixels() { if (pixels == null) { // create monster array. try { loadImages(); } catch (IOException e) { e.printStackTrace(); return null; } } return pixels; } private void loadImages() throws IOException { PDFParser parser = new PDFParser(new FileInputStream(file), PDFParser.EXTRACT_IMAGES); // get all images in the pdf document List<PDFObjectImage> images = new ArrayList<PDFObjectImage>(); for (int i = 0; i < parser.getPageCount(); i++) { parser.parse(i); for (PDFObject po : parser.getObjects()) { if (po instanceof PDFObjectImage) { PDFObjectImage poi = (PDFObjectImage) po; images.add(poi); } } } // create a virtual image that is all the images combined // first column is the image number // second column is pixel (col + row*width) // third column is RGB value pixels = new double[images.size()][][]; for (int i = 0; i < images.size(); i++) { PDFObjectImage poi = images.get(i); int w = poi.getImage().getWidth(); int h = poi.getImage().getHeight(); int[] rgb = poi.getImage().getRGB(0, 0, w, h, null, 0, w); pixels[i] = new double[rgb.length][3]; for (int j = 0; j < rgb.length; j++) { pixels[i][j][0] = (rgb[j] & 0xff0000) >> 16; pixels[i][j][1] = (rgb[j] & 0x00ff00) >> 8; pixels[i][j][2] = (rgb[j] & 0x0000ff) >> 0; } } // close the parser parser.close(); } // ---------------------------------------------------------------------- // HasText // ---------------------------------------------------------------------- @Override public List<String> getWords() { if (words == null) { words = new ArrayList<String>(); try { PDFParser parser = new PDFParser(new FileInputStream(file), PDFParser.EXTRACT_TEXT); PDFGroupingText textgroup = new PDFGroupingText(PDFGroupingText.REMOVE_EMPTY_LINES); for (int i = 0; i < parser.getPageCount(); i++) { parser.parse(i); for (PDFObject po : textgroup.group(parser.getObjects())) { if (po instanceof PDFObjectText) { for (String s : ((PDFObjectText) po).getText().split("\\W+")) { //$NON-NLS-1$ if (!s.isEmpty()) { words.add(s); } } } } } parser.close(); } catch (IOException e) { e.printStackTrace(); } } return words; } // ---------------------------------------------------------------------- // HasLineGraphics // ---------------------------------------------------------------------- @Override public List<Path2D> getLineGraphics() { if (graphics == null) { graphics = new ArrayList<Path2D>(); try { PDFParser parser = new PDFParser(new FileInputStream(file), PDFParser.EXTRACT_GRAPHICS); PDFGroupingGraphics textgroup = new PDFGroupingGraphics(); for (int i = 0; i < parser.getPageCount(); i++) { parser.parse(i); for (PDFObject po : textgroup.group(parser.getObjects())) { if (po instanceof PDFObjectGraphics) { graphics.add(((PDFObjectGraphics) po).getPath()); } } } parser.close(); } catch (IOException e) { e.printStackTrace(); } } return graphics; } } |
...