Page History

...

Code Block

language	java
title	PDF Adapter

public class PDFAdapterTextAdapter implements FileLoader, HasRGBPixels, HasText, HasLineGraphics
{
    private File         file;
    private double[][][] pixels;
    private List<String> words;
    private List<Path2D> graphics;

    static public void main(String[] args) {
        List<Double> weights = new ArrayList<Double>();
        List<PairwiseComparison> comparisons = new ArrayList<PairwiseComparison>();
        
        PairwiseComparison comparison = new PairwiseComparison();
        comparison.setId(UUID.randomUUID().toString());
        comparison.setFirstDataset(new File("data/test1.pdftxt"));
        comparison.setSecondDataset(new File("data/test2.pdftxt"));
        comparison.setAdapterId(PDFAdapterTextAdapter.class.getName());
        comparison.setExtractorId(TextHistogramExtractor.class.getName());
        comparison.setMeasureId(LabelHistogramEuclidianDistanceMeasure.class.getName());

         comparisons.add(comparisonExecutionEngine ee = new ExecutionEngine(); 
        weights.add(0.7);
ee.submit(comparison, new ComparisonStatusHandler() {
            @Override
           comparison =public newvoid PairwiseComparisononStarted(); {
        comparison.setId(UUID.randomUUID().toString());
        comparisonSystem.out.setFirstDataset(new File("data/test1.pdf")println("STARTED : ");
         comparison.setSecondDataset(new File("data/test2.pdf"));
   }

            comparison.setAdapterId(PDFAdapter.class.getName());@Override
        comparison.setExtractorId(TextHistogramExtractor.class.getName());
    public void onFailed(String  comparison.setMeasureId(LabelHistogramEuclidianDistanceMeasure.class.getName());msg, Throwable e) {
        comparisons.add(comparison);
        weightsSystem.out.add(0.2);
println("FAILED  : "   + msg);
     
        comparison = new PairwiseComparisone.printStackTrace();
        comparison.setId(UUID.randomUUID().toString());
        comparisonSystem.setFirstDataset(new File("data/test1.pdf"))exit(0);
        comparison.setSecondDataset(new File("data/test2.pdf"));
    }

            comparison.setAdapterId(PDFAdapter.class.getName());@Override
        comparison.setExtractorId(TextHistogramExtractor.class.getName());
    public void onDone(double value) {
           comparison.setMeasureId(LabelHistogramEuclidianDistanceMeasure.class.getName()     System.out.println("DONE    : " + value);
        comparisons.add(comparison);
        weightsSystem.addexit(0.1);

        ComprehensiveEngine engine = new ComprehensiveEngine(); }

        Double d = engine.compute(comparisons, weights); @Override
        System.out.println(d);
    public void onAborted(String    System.exit(0);msg) {
        
        ExecutionEngine ee = new ExecutionEngine(System.out.println("ABORTED : " + msg);

        ee.submit(comparison, new ComparisonStatusHandler() {
      System.exit(0);
      @Override
      }
      public void onStarted(}) {;
    }


    public TextAdapter() {}

      System.out.println("STARTED : ");// ----------------------------------------------------------------------
    // FileLoader
       }
// ----------------------------------------------------------------------
    @Override
    public void load(File file) @Override{
        this.file = file;
  public void onFailed(String msg, Throwable e) { }

    @Override
    public String getName() {
        return System.out.println("FAILED  : " + msg)"Text Document";
    }

    @Override
    public List<String>   e.printStackTracegetSupportedMediaTypes(); {
        List<String> mediaTypes =      System.exit(0new ArrayList<String>();
            }

mediaTypes.add("text/*");
            @Overridereturn mediaTypes;
    }

    // ----------------------------------------------------------------------
   public void onDone(double value) {// HasText
                System.out.println("DONE    : " + value);
                System.exit(0);
            }

            @Override
            public void onAborted(String msg) {
                System.out.println("ABORTED : " + msg);
                System.exit(0);
            }
        });
    }

    /**
     * Creates an adapter with no file attached. The constructor body is empty,
     * so nothing is read until {@link #load(File)} supplies a file and one of
     * the accessor methods is called.
     */
    public PDFAdapter() {}

    // ----------------------------------------------------------------------
    // FileLoader
    // ----------------------------------------------------------------------

    /**
     * Remembers the file this adapter should read from.
     *
     * Only the reference is stored here; the file is not opened or parsed by
     * this method.
     *
     * @param file the document to associate with this adapter
     */
    @Override
    public void load(File file) {
        this.file = file;
    }

    /**
     * Returns the display name of this adapter.
     *
     * @return the constant string {@code "PDF Document"}
     */
    @Override
    public String getName() {
        return "PDF Document";
    }

    /**
     * Lists the media types this adapter accepts.
     *
     * @return a freshly allocated list holding the single entry
     *         {@code "application/pdf"}
     */
    @Override
    public List<String> getSupportedMediaTypes() {
        final List<String> supported = new ArrayList<String>();
        supported.add("application/pdf");
        return supported;
    }

    // ----------------------------------------------------------------------
    // HasRGBPixels
    // ----------------------------------------------------------------------

    /**
     * Looks up a single value in the cached pixel array, loading the array on
     * first use.
     *
     * The parameters are used directly as the three array indices. Note that
     * {@code loadImages()} fills the array as [image number][pixel index][RGB
     * channel], so {@code row} and {@code column} address that layout rather
     * than a 2-D image grid.
     *
     * @param row    first index into the pixel array
     * @param column second index into the pixel array
     * @param band   third index into the pixel array
     * @return the stored value, or {@link Double#NaN} when the pixel data could
     *         not be loaded
     */
    @Override
    public double getRGBPixel(int row, int column, int band) {
        // Only trigger a load when nothing is cached yet; the short-circuit
        // keeps getRGBPixels() from being called once pixels is populated.
        final boolean unavailable = (pixels == null) && (getRGBPixels() == null);
        return unavailable ? Double.NaN : pixels[row][column][band];
    }

    @Override
    public double[][][] getRGBPixels() {
        if (pixels == null) {
            // create monster array.
            try {
                loadImages();
            } catch (IOException e) {
                e.printStackTrace();
                return null;
            }
        }

        return pixels;
    }

    private void loadImages() throws IOException {
        PDFParser parser = new PDFParser(new FileInputStream(file), PDFParser.EXTRACT_IMAGES);

        // get all images in the pdf document
        List<PDFObjectImage> images = new ArrayList<PDFObjectImage>();

        for (int i = 0; i < parser.getPageCount(); i++) {
            parser.parse(i);

            for (PDFObject po : parser.getObjects()) {
                if (po instanceof PDFObjectImage) {
                    PDFObjectImage poi = (PDFObjectImage) po;
                    images.add(poi);
                }
            }
        }

        // create a virtual image that is all the images combined
        // first column is the image number
        // second column is pixel (col + row*width)
        // third column is RGB value
        pixels = new double[images.size()][][];

        for (int i = 0; i < images.size(); i++) {
            PDFObjectImage poi = images.get(i);
            int w = poi.getImage().getWidth();
            int h = poi.getImage().getHeight();
            int[] rgb = poi.getImage().getRGB(0, 0, w, h, null, 0, w);
            pixels[i] = new double[rgb.length][3];

            for (int j = 0; j < rgb.length; j++) {
                pixels[i][j][0] = (rgb[j] & 0xff0000) >> 16;
                pixels[i][j][1] = (rgb[j] & 0x00ff00) >> 8;
                pixels[i][j][2] = (rgb[j] & 0x0000ff) >> 0;
            }
        }

        // close the parser
        parser.close();
    }

    // ----------------------------------------------------------------------
    // HasText
    // ----------------------------------------------------------------------

    /**
     * Returns the words found in the PDF's text, extracting them on first use.
     *
     * Text objects are grouped with empty lines removed, then split on runs of
     * non-word characters ({@code \W+}); empty tokens are discarded.
     *
     * If an {@link IOException} occurs the stack trace is printed and whatever
     * was collected up to that point (possibly nothing) is kept and returned
     * on this and later calls. The parser and file stream are closed in
     * {@code finally} blocks so a failed parse does not leak the file handle.
     *
     * @return the cached list of words; never {@code null}
     */
    @Override
    public List<String> getWords() {
        if (words == null) {
            words = new ArrayList<String>();

            try {
                FileInputStream in = new FileInputStream(file);
                try {
                    PDFParser parser = new PDFParser(in, PDFParser.EXTRACT_TEXT);
                    try {
                        PDFGroupingText textgroup = new PDFGroupingText(PDFGroupingText.REMOVE_EMPTY_LINES);

                        for (int i = 0; i < parser.getPageCount(); i++) {
                            parser.parse(i);
                            for (PDFObject po : textgroup.group(parser.getObjects())) {
                                if (po instanceof PDFObjectText) {
                                    for (String s : ((PDFObjectText) po).getText().split("\\W+")) { //$NON-NLS-1$
                                        if (!s.isEmpty()) {
                                            words.add(s);
                                        }
                                    }
                                }
                            }
                        }
                    } finally {
                        // release the parser even when a page failed to parse
                        parser.close();
                    }
                } finally {
                    // Covers a failed parser construction; a second close on
                    // an already-closed FileInputStream has no effect.
                    in.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }

        }

        return words;
    }

    // ----------------------------------------------------------------------
    // HasLineGraphics
    // ----------------------------------------------------------------------

    @Override
    public List<Path2D> getLineGraphics() {
        if (graphics == null) {
            graphics = new ArrayList<Path2D>();

            try {
                PDFParser parser = new PDFParser(new FileInputStream(file), PDFParser.EXTRACT_GRAPHICS);
                PDFGroupingGraphics textgroup = new PDFGroupingGraphics();
---------------------------------
    @Override
    public List<String> getWords() {
                for (int i = 0; i < parser.getPageCount(); i++) if (words == null) {
            words =       parser.parse(inew ArrayList<String>();

                    for (PDFObject po : textgroup.group(parser.getObjects())) {
       try {
                BufferedReader br = new ifBufferedReader(new (po instanceof PDFObjectGraphics) {
FileReader(file));
                String line;
                graphics.addwhile(((PDFObjectGraphics) po).getPath());line = br.readLine()) != null) {
                    String[] w =  }line.split(" ");
                    }words.addAll(Arrays.asList(w));
                }

                parserbr.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        return graphicswords;
    }
 }

Code Block

language	java
title	Text Histogram Extractor

public class TextHistogramExtractor implements Extractor
{
    @Override
    public Adapter newAdapter() {
        throw (new RuntimeException("Not supported."));
    }

    @Override
    public String getName() {
        return "Text Histogram Extractor";
    }

    @Override
    public Set<Class<? extends Adapter>> supportedAdapters() {
        Set<Class<? extends Adapter>> adapters = new HashSet<Class<? extends Adapter>>();
        adapters.add(HasText.class);
        return adapters;
    }

    @Override
    public Class<? extends Descriptor> getFeatureType() {
        return LabelHistogramDescriptor.class;
    }

    @Override
    public Descriptor extract(Adapter adapter) throws Exception {
        if (adapter instanceof HasText) {
            LabelHistogramDescriptor desc = new LabelHistogramDescriptor();

            for (String word : ((HasText) adapter).getWords()) {
                desc.increaseBin(word);
            }

            return desc;
        } else {
            throw new UnsupportedTypeException();
        }
    }
    
    @Override
    public boolean hasPreview(){
        return false;
    }
    
    @Override
    public String previewName(){
        return null;
    }
}

...

Page tree

Versions Compared

Old Version 121

New Version 122

Key