...
Code Block | ||||
---|---|---|---|---|
| ||||
public class PDFAdapter implements FileLoader, HasRGBPixels, HasText, HasLineGraphics {
private File file;
private double[][][] pixels;
private List<String> words;
private List<Path2D> graphics;
/**
 * Manual smoke test: builds three weighted pairwise comparisons of
 * {@code data/test1.pdf} against {@code data/test2.pdf}, runs them through
 * a {@link ComprehensiveEngine}, prints the combined result and exits.
 *
 * @param args ignored
 */
static public void main(String[] args) {
    List<Double> weights = new ArrayList<Double>();
    List<PairwiseComparison> comparisons = new ArrayList<PairwiseComparison>();

    PairwiseComparison comparison = newTestComparison(TextHistogramExtractor.class);
    comparisons.add(comparison);
    weights.add(0.7);

    // The original code first set TextHistogramExtractor here and then
    // immediately overwrote it; only the effective (image) extractor is kept.
    comparison = newTestComparison(ImageHistogramExtractor.class);
    comparisons.add(comparison);
    weights.add(0.2);

    // NOTE(review): same extractor as the first comparison - confirm this is
    // intended and not a copy/paste leftover.
    comparison = newTestComparison(TextHistogramExtractor.class);
    comparisons.add(comparison);
    weights.add(0.1);

    ComprehensiveEngine engine = new ComprehensiveEngine();
    Double d = engine.compute(comparisons, weights);
    System.out.println(d);
    System.exit(0);

    // NOTE(review): everything below is never executed because of the
    // System.exit(0) above. It is kept as the single-comparison variant of
    // this smoke test; remove the exit above to run it.
    ExecutionEngine ee = new ExecutionEngine();
    ee.submit(comparison, new ComparisonStatusHandler() {
        @Override
        public void onStarted() {
            System.out.println("STARTED : ");
        }

        @Override
        public void onFailed(String msg, Throwable e) {
            System.out.println("FAILED : " + msg);
            e.printStackTrace();
            System.exit(0);
        }

        @Override
        public void onDone(double value) {
            System.out.println("DONE : " + value);
            System.exit(0);
        }

        @Override
        public void onAborted(String msg) {
            System.out.println("ABORTED : " + msg);
            System.exit(0);
        }
    });
}

/**
 * Creates a comparison of the two test PDFs that uses this adapter, the
 * given extractor and the label-histogram Euclidean distance measure.
 *
 * @param extractorClass class whose name is used as the extractor id
 * @return a new comparison with a random UUID as its id
 */
private static PairwiseComparison newTestComparison(Class<?> extractorClass) {
    PairwiseComparison comparison = new PairwiseComparison();
    comparison.setId(UUID.randomUUID().toString());
    comparison.setFirstDataset(new File("data/test1.pdf"));
    comparison.setSecondDataset(new File("data/test2.pdf"));
    comparison.setAdapterId(PDFAdapter.class.getName());
    comparison.setExtractorId(extractorClass.getName());
    comparison.setMeasureId(LabelHistogramEuclidianDistanceMeasure.class.getName());
    return comparison;
}
/**
 * Creates an adapter with no file attached; call {@link #load(File)}
 * before using any of the accessors.
 */
public PDFAdapter() {
    super();
}
// ----------------------------------------------------------------------
// FileLoader
// ----------------------------------------------------------------------
/**
 * Remembers the PDF file to read from. Nothing is parsed here; pixels,
 * words and line graphics are extracted lazily by their accessors.
 *
 * @param file the PDF document this adapter should expose
 */
@Override
public void load(File file) {
    this.file = file;
    // Drop anything extracted from a previously loaded file, otherwise the
    // lazy accessors would keep returning the old document's data.
    this.pixels = null;
    this.words = null;
    this.graphics = null;
}
/**
 * Returns the fixed display name of this adapter.
 *
 * @return the string {@code "PDF Document"}
 */
@Override
public String getName() {
    final String displayName = "PDF Document";
    return displayName;
}
/**
 * Lists the media types this adapter reports as supported.
 *
 * @return a new, mutable list containing only {@code "application/pdf"}
 */
@Override
public List<String> getSupportedMediaTypes() {
    final List<String> supported = new ArrayList<String>();
    supported.add("application/pdf");
    return supported;
}
// ----------------------------------------------------------------------
// HasRGBPixels
// ----------------------------------------------------------------------
/**
 * Returns a single value from the pixel array, extracting the images
 * first if that has not happened yet.
 *
 * @param row    first index into the pixel array
 * @param column second index into the pixel array
 * @param band   third index into the pixel array
 * @return the stored value, or {@link Double#NaN} when the pixel array
 *         could not be produced
 */
@Override
public double getRGBPixel(int row, int column, int band) {
    // Either the array is already cached, or a load attempt succeeds now.
    if ((pixels != null) || (getRGBPixels() != null)) {
        return pixels[row][column][band];
    }
    return Double.NaN;
}
/**
 * Returns the cached pixel array, building it on first use by calling
 * {@code loadImages()}.
 *
 * @return the pixel array, or {@code null} when loading threw an
 *         {@link IOException} (the stack trace is printed)
 */
@Override
public double[][][] getRGBPixels() {
    if (pixels != null) {
        return pixels;
    }
    // First access: build the (potentially very large) array now.
    try {
        loadImages();
    } catch (IOException ioe) {
        ioe.printStackTrace();
        return null;
    }
    return pixels;
}
/**
 * Extracts every image from every page of the PDF and stores them in
 * {@code pixels} as one "virtual image":
 * <ul>
 * <li>first index: image number,</li>
 * <li>second index: pixel number ({@code col + row * width}),</li>
 * <li>third index: red, green, blue component (0-255).</li>
 * </ul>
 * The field is only assigned once all images were converted, so a failure
 * part-way through never leaves a half-filled array behind. The input
 * stream and the parser are closed even when parsing throws.
 *
 * @throws IOException if the file cannot be opened, parsed or closed
 */
private void loadImages() throws IOException {
    FileInputStream in = new FileInputStream(file);
    try {
        PDFParser parser = new PDFParser(in, PDFParser.EXTRACT_IMAGES);
        try {
            // get all images in the pdf document
            List<PDFObjectImage> images = new ArrayList<PDFObjectImage>();
            for (int page = 0; page < parser.getPageCount(); page++) {
                parser.parse(page);
                for (PDFObject po : parser.getObjects()) {
                    if (po instanceof PDFObjectImage) {
                        images.add((PDFObjectImage) po);
                    }
                }
            }

            // Convert while the parser is still open, as the original code
            // did, in case the image data depends on it.
            double[][][] combined = new double[images.size()][][];
            for (int i = 0; i < images.size(); i++) {
                PDFObjectImage poi = images.get(i);
                int w = poi.getImage().getWidth();
                int h = poi.getImage().getHeight();
                int[] rgb = poi.getImage().getRGB(0, 0, w, h, null, 0, w);
                combined[i] = new double[rgb.length][3];
                for (int j = 0; j < rgb.length; j++) {
                    combined[i][j][0] = (rgb[j] >> 16) & 0xff;
                    combined[i][j][1] = (rgb[j] >> 8) & 0xff;
                    combined[i][j][2] = rgb[j] & 0xff;
                }
            }
            pixels = combined;
        } finally {
            parser.close();
        }
    } finally {
        // Closing again is harmless if the parser already closed the stream.
        in.close();
    }
}
// ----------------------------------------------------------------------
// HasText
// ----------------------------------------------------------------------
/**
 * Returns all words found in the text of the PDF, extracting them on
 * first use. Text objects are grouped per page and split on runs of
 * non-word characters; empty tokens are skipped.
 * <p>
 * If reading fails, the stack trace is printed and the words collected so
 * far (possibly none) are returned and stay cached. The input stream and
 * the parser are closed even when parsing throws.
 *
 * @return the list of words, never {@code null}
 */
@Override
public List<String> getWords() {
    if (words == null) {
        words = new ArrayList<String>();
        try {
            FileInputStream in = new FileInputStream(file);
            try {
                PDFParser parser = new PDFParser(in, PDFParser.EXTRACT_TEXT);
                try {
                    PDFGroupingText textgroup = new PDFGroupingText(PDFGroupingText.REMOVE_EMPTY_LINES);
                    for (int page = 0; page < parser.getPageCount(); page++) {
                        parser.parse(page);
                        for (PDFObject po : textgroup.group(parser.getObjects())) {
                            if (po instanceof PDFObjectText) {
                                for (String s : ((PDFObjectText) po).getText().split("\\W+")) { //$NON-NLS-1$
                                    if (!s.isEmpty()) {
                                        words.add(s);
                                    }
                                }
                            }
                        }
                    }
                } finally {
                    parser.close();
                }
            } finally {
                // Closing again is harmless if the parser already closed the stream.
                in.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return words;
}
// ----------------------------------------------------------------------
// HasLineGraphics
// ----------------------------------------------------------------------
/**
 * Returns the paths of all grouped graphics objects in the PDF,
 * extracting them on first use.
 * <p>
 * If reading fails, the stack trace is printed and the paths collected so
 * far (possibly none) are returned and stay cached. The input stream and
 * the parser are closed even when parsing throws.
 *
 * @return the list of paths, never {@code null}
 */
@Override
public List<Path2D> getLineGraphics() {
    if (graphics == null) {
        graphics = new ArrayList<Path2D>();
        try {
            FileInputStream in = new FileInputStream(file);
            try {
                PDFParser parser = new PDFParser(in, PDFParser.EXTRACT_GRAPHICS);
                try {
                    PDFGroupingGraphics graphicsGroup = new PDFGroupingGraphics();
                    for (int page = 0; page < parser.getPageCount(); page++) {
                        parser.parse(page);
                        for (PDFObject po : graphicsGroup.group(parser.getObjects())) {
                            if (po instanceof PDFObjectGraphics) {
                                graphics.add(((PDFObjectGraphics) po).getPath());
                            }
                        }
                    }
                } finally {
                    parser.close();
                }
            } finally {
                // Closing again is harmless if the parser already closed the stream.
                in.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return graphics;
}
} |
...