public class PDFAdapterTextAdapter implements FileLoader, HasRGBPixels, HasText, HasLineGraphics
{
private File file;
private double[][][] pixels;
private List<String> words;
private List<Path2D> graphics;
static public void main(String[] args) {
List<Double> weights = new ArrayList<Double>();
List<PairwiseComparison> comparisons = new ArrayList<PairwiseComparison>();
PairwiseComparison comparison = new PairwiseComparison();
comparison.setId(UUID.randomUUID().toString());
comparison.setFirstDataset(new File("data/test1.pdftxt"));
comparison.setSecondDataset(new File("data/test2.pdftxt"));
comparison.setAdapterId(PDFAdapterTextAdapter.class.getName());
comparison.setExtractorId(TextHistogramExtractor.class.getName());
comparison.setMeasureId(LabelHistogramEuclidianDistanceMeasure.class.getName());
comparisons.add(comparisonExecutionEngine ee = new ExecutionEngine();
weights.add(0.7);
ee.submit(comparison, new ComparisonStatusHandler() {
@Override
comparison =public newvoid PairwiseComparisononStarted(); {
comparison.setId(UUID.randomUUID().toString());
comparisonSystem.out.setFirstDataset(new File("data/test1.pdf")println("STARTED : ");
comparison.setSecondDataset(new File("data/test2.pdf"));
}
comparison.setAdapterId(PDFAdapter.class.getName());@Override
comparison.setExtractorId(TextHistogramExtractor.class.getName());
public void onFailed(String comparison.setMeasureId(LabelHistogramEuclidianDistanceMeasure.class.getName());msg, Throwable e) {
comparisons.add(comparison);
weightsSystem.out.add(0.2);
println("FAILED : " + msg);
comparison = new PairwiseComparisone.printStackTrace();
comparison.setId(UUID.randomUUID().toString());
comparisonSystem.setFirstDataset(new File("data/test1.pdf"))exit(0);
comparison.setSecondDataset(new File("data/test2.pdf"));
}
comparison.setAdapterId(PDFAdapter.class.getName());@Override
comparison.setExtractorId(TextHistogramExtractor.class.getName());
public void onDone(double value) {
comparison.setMeasureId(LabelHistogramEuclidianDistanceMeasure.class.getName() System.out.println("DONE : " + value);
comparisons.add(comparison);
weightsSystem.addexit(0.1);
ComprehensiveEngine engine = new ComprehensiveEngine(); }
Double d = engine.compute(comparisons, weights); @Override
System.out.println(d);
public void onAborted(String System.exit(0);msg) {
ExecutionEngine ee = new ExecutionEngine(System.out.println("ABORTED : " + msg);
ee.submit(comparison, new ComparisonStatusHandler() {
System.exit(0);
@Override
}
public void onStarted(}) {;
}
public TextAdapter() {}
System.out.println("STARTED : ");// ----------------------------------------------------------------------
// FileLoader
}
// ----------------------------------------------------------------------
@Override
public void load(File file) @Override{
this.file = file;
public void onFailed(String msg, Throwable e) { }
@Override
public String getName() {
return System.out.println("FAILED : " + msg)"Text Document";
}
@Override
public List<String> e.printStackTracegetSupportedMediaTypes(); {
List<String> mediaTypes = System.exit(0new ArrayList<String>();
}
mediaTypes.add("text/*");
@Overridereturn mediaTypes;
}
// ----------------------------------------------------------------------
public void onDone(double value) {// HasText
System.out.println("DONE : " + value);
System.exit(0);
}
@Override
public void onAborted(String msg) {
System.out.println("ABORTED : " + msg);
System.exit(0);
}
});
}
/**
 * Creates an adapter with no file attached. Nothing is read here; the
 * document is supplied later through {@code load(File)}.
 */
public PDFAdapter() {}
// ----------------------------------------------------------------------
// FileLoader
// ----------------------------------------------------------------------
/**
 * Attaches the document this adapter reads from.
 * <p>
 * Pixels, words and line graphics are extracted lazily and cached in
 * fields, so any previously cached data is discarded here; otherwise a
 * second call to {@code load} would keep returning content extracted from
 * the earlier file.
 *
 * @param file the document to read on the next extraction request
 */
@Override
public void load(File file) {
    this.file = file;
    // Invalidate the lazy caches so they are rebuilt from the new file.
    this.pixels = null;
    this.words = null;
    this.graphics = null;
}
/**
 * Returns the display name of this adapter.
 *
 * @return the constant label {@code "PDF Document"}
 */
@Override
public String getName() {
    final String displayName = "PDF Document";
    return displayName;
}
/**
 * Lists the media types this adapter declares support for.
 *
 * @return a new, mutable list containing only {@code "application/pdf"}
 */
@Override
public List<String> getSupportedMediaTypes() {
    final List<String> supported = new ArrayList<String>();
    supported.add("application/pdf");
    return supported;
}
// ----------------------------------------------------------------------
// HasRGBPixels
// ----------------------------------------------------------------------
/**
 * Returns a single value from the cached pixel array, loading the array on
 * first use.
 * <p>
 * The three arguments index the array in order:
 * {@code pixels[row][column][band]}. No bounds checking is done here, so an
 * out-of-range index surfaces as the usual array exception.
 *
 * @param row    first index into the pixel array
 * @param column second index into the pixel array
 * @param band   third index into the pixel array
 * @return the stored value, or {@code Double.NaN} when no pixel data could
 *         be loaded
 */
@Override
public double getRGBPixel(int row, int column, int band) {
    // Only trigger a load when nothing is cached yet; a null result from
    // the load means the data is unavailable.
    final boolean unavailable = (pixels == null) && (getRGBPixels() == null);
    if (unavailable) {
        return Double.NaN;
    }
    return pixels[row][column][band];
}
/**
 * Returns the cached pixel array, extracting it from the file on first use.
 *
 * @return the pixel array, or {@code null} when extraction failed with an
 *         {@link IOException} (the stack trace is printed)
 */
@Override
public double[][][] getRGBPixels() {
    // Fast path: already extracted.
    if (pixels != null) {
        return pixels;
    }

    // First request: build the (potentially very large) array now.
    try {
        loadImages();
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
    return pixels;
}
/**
 * Extracts every image found by the parser and stores their pixels in
 * {@code pixels}.
 * <p>
 * The result is a "virtual image" made of all images combined:
 * <ul>
 * <li>first index: image number, in the order the images were found;</li>
 * <li>second index: pixel position within that image
 *     ({@code col + row * width});</li>
 * <li>third index: colour band, 0 = red, 1 = green, 2 = blue.</li>
 * </ul>
 * The parser and the underlying stream are always released, even when
 * parsing fails. {@code pixels} is assigned only after the whole array has
 * been filled, so a failure never leaves a partially populated cache behind.
 *
 * @throws IOException if the file cannot be opened or read
 */
private void loadImages() throws IOException {
    FileInputStream in = new FileInputStream(file);
    try {
        PDFParser parser = new PDFParser(in, PDFParser.EXTRACT_IMAGES);
        try {
            // get all images in the pdf document
            List<PDFObjectImage> images = new ArrayList<PDFObjectImage>();
            for (int i = 0; i < parser.getPageCount(); i++) {
                parser.parse(i);
                for (PDFObject po : parser.getObjects()) {
                    if (po instanceof PDFObjectImage) {
                        images.add((PDFObjectImage) po);
                    }
                }
            }

            // Build into a local array first; publish to the field only
            // once every image has been converted.
            double[][][] loaded = new double[images.size()][][];
            for (int i = 0; i < images.size(); i++) {
                PDFObjectImage poi = images.get(i);
                int w = poi.getImage().getWidth();
                int h = poi.getImage().getHeight();
                int[] rgb = poi.getImage().getRGB(0, 0, w, h, null, 0, w);
                loaded[i] = new double[rgb.length][3];
                for (int j = 0; j < rgb.length; j++) {
                    // Unpack the 0xRRGGBB components of the packed int.
                    loaded[i][j][0] = (rgb[j] & 0xff0000) >> 16;
                    loaded[i][j][1] = (rgb[j] & 0x00ff00) >> 8;
                    loaded[i][j][2] = (rgb[j] & 0x0000ff);
                }
            }
            pixels = loaded;
        } finally {
            // close the parser, also on failure
            parser.close();
        }
    } finally {
        // Covers the case where the parser could not be constructed;
        // closing an already closed stream is harmless.
        in.close();
    }
}
// ----------------------------------------------------------------------
// HasText
// ----------------------------------------------------------------------
/**
 * Returns the words of the document, extracting them on first use.
 * <p>
 * Each page is parsed, its objects are grouped with
 * {@code PDFGroupingText}, and the text of every {@code PDFObjectText} is
 * split on runs of non-word characters ({@code \W+}); empty tokens are
 * dropped. The list is cached in {@code words}.
 * <p>
 * If an {@link IOException} occurs, the stack trace is printed and the
 * words collected up to that point (possibly none) are returned and remain
 * cached. The parser and the underlying stream are released in every case.
 *
 * @return the cached list of words, never {@code null}
 */
@Override
public List<String> getWords() {
    if (words == null) {
        words = new ArrayList<String>();
        try {
            FileInputStream in = new FileInputStream(file);
            try {
                PDFParser parser = new PDFParser(in, PDFParser.EXTRACT_TEXT);
                try {
                    PDFGroupingText textgroup = new PDFGroupingText(PDFGroupingText.REMOVE_EMPTY_LINES);
                    for (int i = 0; i < parser.getPageCount(); i++) {
                        parser.parse(i);
                        for (PDFObject po : textgroup.group(parser.getObjects())) {
                            if (po instanceof PDFObjectText) {
                                for (String s : ((PDFObjectText) po).getText().split("\\W+")) { //$NON-NLS-1$
                                    if (!s.isEmpty()) {
                                        words.add(s);
                                    }
                                }
                            }
                        }
                    }
                } finally {
                    // Release the parser even when a page fails to parse.
                    parser.close();
                }
            } finally {
                // Covers a failed parser construction; a second close of
                // the stream is harmless.
                in.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return words;
}
// ----------------------------------------------------------------------
// HasLineGraphics
// ----------------------------------------------------------------------
@Override
public List<Path2D> getLineGraphics() {
if (graphics == null) {
graphics = new ArrayList<Path2D>();
try {
PDFParser parser = new PDFParser(new FileInputStream(file), PDFParser.EXTRACT_GRAPHICS);
PDFGroupingGraphics textgroup = new PDFGroupingGraphics();
---------------------------------
@Override
public List<String> getWords() {
for (int i = 0; i < parser.getPageCount(); i++) if (words == null) {
words = parser.parse(inew ArrayList<String>();
for (PDFObject po : textgroup.group(parser.getObjects())) {
try {
BufferedReader br = new ifBufferedReader(new (po instanceof PDFObjectGraphics) {
FileReader(file));
String line;
graphics.addwhile(((PDFObjectGraphics) po).getPath());line = br.readLine()) != null) {
String[] w = }line.split(" ");
}words.addAll(Arrays.asList(w));
}
parserbr.close();
} catch (IOException e) {
e.printStackTrace();
}
}
return graphicswords;
}
} |