Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

Code Block
language: java
title: Adapter
 
Code Block
language: java
title: Extractor
 
Code Block
language: java
title: Measure
public public class WordCountMeasurePDFAdapter implements FileLoader, HasRGBPixels, HasText, HasLineGraphics {
    private File         file;
    private double[][][] pixels;
    private List<String> words;
    private List<Path2D> graphics;

    static public void main(String[] args) {
        List<Double> weights = new ArrayList<Double>();
        List<PairwiseComparison> comparisons = new ArrayList<PairwiseComparison>();
        
        PairwiseComparison comparison = new PairwiseComparison();
        comparison.setId(UUID.randomUUID().toString());
        comparison.setFirstDataset(new File("data/test1.pdf"));
        comparison.setSecondDataset(new File("data/test2.pdf"));
        comparison.setAdapterId(PDFAdapter.class.getName());
        comparison.setExtractorId(TextHistogramExtractor.class.getName());
        // comparison.setExtractorId(ImageHistogramExtractor.class.getName());
        //comparison.setExtractorId(LineGraphicsHistogramExtractor.class.getName());
        // comparison.setMeasureId(LabelHistogramEuclidianDistanceMeasure.class.getName());
        comparison.setMeasureId(LabelHistogramIntersectionDistanceMeasure.class.getName());
        comparisons.add(comparison);
        weights.add(0.7);
        
        comparison = new PairwiseComparison();
        comparison.setId(UUID.randomUUID().toString());
        comparison.setFirstDataset(new File("data/test1.pdf"));
        comparison.setSecondDataset(new File("data/test2.pdf"));
        comparison.setAdapterId(PDFAdapter.class.getName());
        comparison.setExtractorId(TextHistogramExtractor.class.getName());
        comparison.setExtractorId(ImageHistogramExtractor.class.getName());
        comparison.setMeasureId(LabelHistogramEuclidianDistanceMeasure.class.getName());
        comparison.setMeasureId(LabelHistogramIntersectionDistanceMeasure.class.getName());
        comparisons.add(comparison);
        weights.add(0.2);
        
        comparison = new PairwiseComparison();
        comparison.setId(UUID.randomUUID().toString());
        comparison.setFirstDataset(new File("data/test1.pdf"));
        comparison.setSecondDataset(new File("data/test2.pdf"));
        comparison.setAdapterId(PDFAdapter.class.getName());
        comparison.setExtractorId(TextHistogramExtractor.class.getName());
        comparison.setExtractorId(LineGraphicsHistogramExtractor.class.getName());
        comparison.setMeasureId(LabelHistogramEuclidianDistanceMeasure.class.getName());
        comparisons.add(comparison);
        weights.add(0.1);

        ComprehensiveEngine engine = new ComprehensiveEngine();
        Double d = engine.compute(comparisons, weights);
        System.out.println(d);
        System.exit(0);
        
        ExecutionEngine ee = new ExecutionEngine();
        ee.submit(comparison, new ComparisonStatusHandler() {
            @Override
            public void onStarted() {
                System.out.println("STARTED : ");
            }

            @Override
            public void onFailed(String msg, Throwable e) {
                System.out.println("FAILED  : " + msg);
                e.printStackTrace();
                System.exit(0);
            }

            @Override
            public void onDone(double value) {
                System.out.println("DONE    : " + value);
                System.exit(0);
            }

            @Override
            public void onAborted(String msg) {
                System.out.println("ABORTED : " + msg);
                System.exit(0);
            }
        });
    }

    public PDFAdapter() {
    }

    // ----------------------------------------------------------------------
    // FileLoader
    // ----------------------------------------------------------------------

    @Override
    public void load(File file) {
        this.file = file;
    }

    @Override
    public String getName() {
        return "PDF Document";
    }

    @Override
    public List<String> getSupportedMediaTypes() {
        List<String> mediaTypes = new ArrayList<String>();
        mediaTypes.add("application/pdf");
        return mediaTypes;
    }

    // ----------------------------------------------------------------------
    // HasRGBPixels
    // ----------------------------------------------------------------------

    @Override
    public double getRGBPixel(int row, int column, int band) {
        if ((pixels == null) && (getRGBPixels() == null)) {
            return Double.NaN;
        } else {
            return pixels[row][column][band];
        }
    }

    @Override
    public double[][][] getRGBPixels() {
        if (pixels == null) {
            // create monster array.
            try {
                loadImages();
            } catch (IOException e) {
                e.printStackTrace();
                return null;
            }
        }
        return pixels;
    }

    private void loadImages() throws IOException {
        PDFParser parser = new PDFParser(new FileInputStream(file), PDFParser.EXTRACT_IMAGES);

        // get all images in the pdf document
        List<PDFObjectImage> images = new ArrayList<PDFObjectImage>();
        for (int i = 0; i < parser.getPageCount(); i++) {
            parser.parse(i);
            for (PDFObject po : parser.getObjects()) {
                if (po instanceof PDFObjectImage) {
                    PDFObjectImage poi = (PDFObjectImage) po;
                    images.add(poi);
                }
            }
        }

        // create a virtual image that is all the images combined
        // first column is the image number
        // second column is pixel (col + row*width)
        // third column is RGB value
        pixels = new double[images.size()][][];
        for (int i = 0; i < images.size(); i++) {
            PDFObjectImage poi = images.get(i);
            int w = poi.getImage().getWidth();
            int h = poi.getImage().getHeight();
            int[] rgb = poi.getImage().getRGB(0, 0, w, h, null, 0, w);
            pixels[i] = new double[rgb.length][3];
            for (int j = 0; j < rgb.length; j++) {
                pixels[i][j][0] = (rgb[j] & 0xff0000) >> 16;
                pixels[i][j][1] = (rgb[j] & 0x00ff00) >> 8;
                pixels[i][j][2] = (rgb[j] & 0x0000ff) >> 0;
            }
        }

        // close the parser
        parser.close();
    }

    // ----------------------------------------------------------------------
    // HasText
    // ----------------------------------------------------------------------

    @Override
    public List<String> getWords() {
        if (words == null) {
            words = new ArrayList<String>();

            try {
                PDFParser parser = new PDFParser(new FileInputStream(file), PDFParser.EXTRACT_TEXT);
                PDFGroupingText textgroup = new PDFGroupingText(PDFGroupingText.REMOVE_EMPTY_LINES);

                for (int i = 0; i < parser.getPageCount(); i++) {
                    parser.parse(i);
                    for (PDFObject po : textgroup.group(parser.getObjects())) {
                        if (po instanceof PDFObjectText) {
                            for (String s : ((PDFObjectText) po).getText().split("\\W+")) { //$NON-NLS-1$
                                if (!s.isEmpty()) {
                                    words.add(s);
                                }
                            }
                        }
                    }
                }

                parser.close();
            } catch (IOException e) {
                e.printStackTrace();
            }

        }

        return words;
    }

    // ----------------------------------------------------------------------
    // HasLineGraphics
    // ----------------------------------------------------------------------

    @Override
    public List<Path2D> getLineGraphics() {
        if (graphics == null) {
            graphics = new ArrayList<Path2D>();

            try {
                PDFParser parser = new PDFParser(new FileInputStream(file), PDFParser.EXTRACT_GRAPHICS);
                PDFGroupingGraphics textgroup = new PDFGroupingGraphics();

                for (int i = 0; i < parser.getPageCount(); i++) {
                    parser.parse(i);
                    for (PDFObject po : textgroup.group(parser.getObjects())) {
                        if (po instanceof PDFObjectGraphics) {
                            graphics.add(((PDFObjectGraphics) po).getPath());
                        }
                    }
                }

                parser.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        return graphics;
    }
}
Code Block
language: java
title: Extractor
 public class TextHistogramExtractor implements Extractor {
    @Override
    public Adapter newAdapter() {
        throw (new RuntimeException("Not supported."));
    }

    @Override
    public String getName() {
        return "Text Histogram Extractor";
    }

    @Override
    public Set<Class<? extends Adapter>> supportedAdapters() {
        Set<Class<? extends Adapter>> adapters = new HashSet<Class<? extends Adapter>>();
        adapters.add(HasText.class);
        return adapters;
    }

    @Override
    public Class<? extends Descriptor> getFeatureType() {
        return LabelHistogramDescriptor.class;
    }

    @Override
    public Descriptor extract(Adapter adapter) throws Exception {
        if (adapter instanceof HasText) {
            LabelHistogramDescriptor desc = new LabelHistogramDescriptor();
            for (String word : ((HasText) adapter).getWords()) {
                desc.increaseBin(word);
            }
            return desc;
        } else {
            throw new UnsupportedTypeException();
        }
    }
    
    @Override
    public boolean hasPreview(){
        return false;
    }
    
    @Override
    public String previewName(){
        return null;
    }
}
Code Block
language: java
title: Measure
public class LabelHistogramEuclidianDistanceMeasure implements Measure {

    @Override
    public SimilarityPercentage normalize(Similarity similarity) {
        return new SimilarityPercentage(1 - similarity.getValue());
    }

    @Override
    public String getFeatureType() {
        return LabelHistogramDescriptor.class.getName();
    }

    @Override
    public String getName() {
        return "Histogram Distance";
    }

    @Override
    public Class<LabelHistogramEuclidianDistanceMeasure> getType() {
        return LabelHistogramEuclidianDistanceMeasure.class;
    }

    // correlation

    @Override
    public Similarity compare(Descriptor desc1, Descriptor desc2) throws Exception {
        if ((desc1 instanceof LabelHistogramDescriptor) && (desc2 instanceof LabelHistogramDescriptor)) {
            LabelHistogramDescriptor lhd1 = (LabelHistogramDescriptor) desc1;
            LabelHistogramDescriptor lhd2 = (LabelHistogramDescriptor) desc2;

            // get all possible labels
            Set<String> labels = new HashSet<String>();
            labels.addAll(lhd1.getLabels());
            labels.addAll(lhd2.getLabels());

            // normalize
            lhd1.normalize();
            lhd2.normalize();
            
            // compute distance
            double sum = 0;
            for (String s : labels) {
                Double b1 = lhd1.getBin(s);
                Double b2 = lhd2.getBin(s);
                if (b1 == null) {
                    sum += b2 * b2;
                } else if (b2 == null) {
                    sum += b1 * b1;
                } else {
                    sum += (b1 - b2) * (b1 - b2);
                }
            }

            return new SimilarityNumber(Math.sqrt(sum), 0, 1, 0);
        } else {
            throw new UnsupportedTypeException();
        }
    Serializable, Measure
{
	private static final long SLEEP = 10000;

	@Override
	public Similarity compare(Descriptor feature1, Descriptor feature2) throws Exception {
		Thread.sleep(SLEEP);
		return new SimilarityNumber(0);
	}

	@Override
	public SimilarityPercentage normalize(Similarity similarity) {
		return null;
	}

	@Override
	public String getFeatureType() {
		return WordCountMeasure.class.getName();
	}

	@Override
	public String getName() {
		return "Word Count Measure";
	}

	@Override
	public Class<WordCountMeasure> getType() {
		return WordCountMeasure.class;
	}
}