...
Medici extractors typically serve to automatically extract some new kind of information from a file's content when it is uploaded into Medici. These extractors do this by connecting to a shared RabbitMQ bus. When a new file is uploaded to Medici it is announced on this bus. Extractors that can handle a file of the type posted on the bus are triggered and the data they in turn create is returned to Medici as derived data to be associated with that file. The extractors themselves can be implemented in a variety of languages. Examples of these extractors in different languages can be found in the extractors-templates code repository.
Java
An extractor must establish a connection with the Medici RabbitMQ bus, handle incoming messages, start jobs based on received messages, and ultimately carry out a job on a given file. The example below simply counts the number of words in a document and returns this information as a piece of metadata to be associated with the file.
Code Block |
---|
theme | Emacs |
---|
language | java |
---|
title | Connecting to RabbitMQ |
---|
|
/**
 * Connects to the RabbitMQ bus, binds this extractor's queue to the exchange and
 * then consumes messages until the JVM is terminated.
 *
 * Each delivery is decoded into {@code messageReceived}, its reply properties are
 * recorded, {@code processMessageReceived()} is invoked and the delivery is then
 * acknowledged. Any exception raised while connecting or while waiting prints the
 * stack trace and terminates the JVM with exit status 1, so this method never
 * returns normally.
 *
 * @param rabbitMQUsername user name used to authenticate against RabbitMQ
 * @param rabbitMQpassword password used to authenticate against RabbitMQ
 */
protected void startExtractor(String rabbitMQUsername, String rabbitMQpassword)
{
    try {
        // Open channel and declare exchange and consumer
        ConnectionFactory factory = new ConnectionFactory();
        factory.setHost(serverAddr);
        factory.setUsername(rabbitMQUsername);
        factory.setPassword(rabbitMQpassword);
        Connection connection = factory.newConnection();
        final Channel channel = connection.createChannel();
        channel.exchangeDeclare(EXCHANGE_NAME, "topic", true);
        channel.queueDeclare(QUEUE_NAME, DURABLE, EXCLUSIVE, AUTO_DELETE, null);
        // Only deliveries whose routing key matches this pattern reach the queue.
        channel.queueBind(QUEUE_NAME, EXCHANGE_NAME, "*.file.text.plain.#");
        this.channel = channel;

        // create listener (auto-ack is off; each delivery is acked after processing)
        channel.basicConsume(QUEUE_NAME, false, CONSUMER_TAG, new DefaultConsumer(channel) {
            @Override
            public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body) throws IOException {
                // Decode with an explicit charset instead of the platform default so
                // non-ASCII content is not garbled on hosts with a different default.
                // NOTE(review): assumes Medici publishes UTF-8 encoded messages - confirm.
                messageReceived = new String(body, java.nio.charset.StandardCharsets.UTF_8);
                long deliveryTag = envelope.getDeliveryTag();
                // (process the message components here ...)
                System.out.println(" {x} Received '" + messageReceived + "'");
                replyProps = new AMQP.BasicProperties.Builder().correlationId(properties.getCorrelationId()).build();
                replyTo = properties.getReplyTo();
                processMessageReceived();
                System.out.println(" [x] Done");
                channel.basicAck(deliveryTag, false);
            }
        });

        // start listening: keep the calling thread alive while the consumer runs
        System.out.println(" [*] Waiting for messages. To exit press CTRL+C");
        while (true) {
            Thread.sleep(1000);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(1);
    }
} |
Code Block |
---|
language | java |
---|
title | Processing Messages |
---|
|
/**
 * Handles the message currently stored in {@code messageReceived}: parses it into
 * an extraction job, runs the word-count job, uploads the resulting file as an
 * intermediate result and reports status along the way.
 *
 * A failure inside the job is logged and reported as a status message; a failure
 * while reporting prints the stack trace and terminates the JVM with exit status 1.
 */
protected void processMessageReceived()
{
    final String extractorName = this.getClass().getSimpleName();
    final String completeMsg = "Word count extraction complete. Returning word count file as intermediate result.";
    try {
        // Track the job of THIS message locally so that a parse failure is never
        // reported against the job left in jobReceived by a previous message.
        ExtractionJob job = null;
        File textFile = null;
        try {
            ExampleJavaExtractorService extrServ = new ExampleJavaExtractorService(this);
            job = getRepresentation(messageReceived, ExtractionJob.class);
            jobReceived = job;
            textFile = extrServ.processJob(job);
            job.setFlag("wasText");
            log.info(completeMsg);
            sendStatus(job.getId(), extractorName, completeMsg, log);
            uploadIntermediate(textFile, "text/plain", log);
            sendStatus(job.getId(), extractorName, "DONE.", log);
        } catch (Exception ioe) {
            log.error("Could not finish extraction job.", ioe);
            // Without a parsed job there is no id to report against.
            if (job != null) {
                sendStatus(job.getId(), extractorName, "Could not finish extraction job.", log);
                sendStatus(job.getId(), extractorName, "DONE.", log);
            }
        } finally {
            // Remove the result file even when the upload or a status call failed.
            if (textFile != null) {
                textFile.delete();
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(1);
    }
} |
Code Block |
---|
language | java |
---|
title | Processing Jobs |
---|
|
/**
 * Downloads the file referenced by the given job into a temporary file and passes
 * it to {@code processFile}.
 *
 * @param receivedMsg job describing the host and the id of the file to download
 * @return the file returned by {@code processFile} for the downloaded content
 * @throws IOException if the server does not answer with HTTP 200 or the response
 *                     carries no content-disposition header
 * @throws Exception   if processing the downloaded file fails
 */
public File processJob(ExtractionJob receivedMsg) throws Exception
{
    log.info("Downloading text file with ID "+ receivedMsg.getIntermediateId() +" from " + receivedMsg.getHost());
    callingExtractor.sendStatus(receivedMsg.getId(), callingExtractor.getClass().getSimpleName(), "Downloading text file.", log);

    DefaultHttpClient httpclient = new DefaultHttpClient();
    HttpGet httpGet = new HttpGet(receivedMsg.getHost() +"api/files/"+ receivedMsg.getIntermediateId()+"?key="+playserverKey);
    HttpResponse fileResponse = httpclient.execute(httpGet);
    log.info(fileResponse.getStatusLine());
    // Compare the numeric status code; searching the status line text for "200"
    // would also match unrelated digits or reason phrases.
    if (fileResponse.getStatusLine().getStatusCode() != 200) {
        throw new IOException("File not found.");
    }

    Header[] hdrs = fileResponse.getHeaders("content-disposition");
    if (hdrs.length == 0) {
        throw new IOException("Response has no content-disposition header; cannot determine the file name.");
    }
    String contentDisp = hdrs[0].toString();
    int nameAt = contentDisp.indexOf("filename=");
    // Fall back to a generic name when the header carries no filename parameter.
    String fileName = nameAt >= 0 ? contentDisp.substring(nameAt + 9).trim() : "download";
    // The filename parameter may be quoted; quotes must not end up in the temp file name.
    if (fileName.length() >= 2 && fileName.startsWith("\"") && fileName.endsWith("\"")) {
        fileName = fileName.substring(1, fileName.length() - 1);
    }

    int dotAt = fileName.lastIndexOf(".");
    String prefix = dotAt >= 0 ? fileName.substring(0, dotAt) : fileName;
    // A null suffix makes createTempFile use ".tmp".
    String suffix = dotAt >= 0 ? fileName.substring(dotAt).toLowerCase() : null;
    // File.createTempFile rejects prefixes shorter than three characters.
    while (prefix.length() < 3) {
        prefix = prefix + "_";
    }
    File tempFile = File.createTempFile(prefix, suffix);

    HttpEntity fileEntity = fileResponse.getEntity();
    // try-with-resources closes both streams even if the copy fails.
    try (InputStream fileIs = fileEntity.getContent();
         OutputStream fileOs = new FileOutputStream(tempFile)) {
        IOUtils.copy(fileIs, fileOs);
    }
    EntityUtils.consume(fileEntity);

    log.info("Download complete. Initiating word count generation");
    return processFile(tempFile, receivedMsg.getId());
} |
Code Block |
---|
language | java |
---|
title | Processing Files |
---|
|
/**
 * Runs {@code wc -w} on the downloaded file inside the system temp directory and
 * returns the expected result file.
 *
 * NOTE(review): {@code wc -w} prints the count to standard output, which is handed
 * to the "INFO" StreamGobbler here; nothing visible in this method creates the
 * returned ".txt" file - confirm that StreamGobbler (or another step) writes it.
 *
 * @param tempFile       downloaded file, located in the system temp directory;
 *                       deleted before this method returns or throws
 * @param originalFileId id of the originating file (currently unused)
 * @return the ".txt" file named after {@code tempFile} in the temp directory
 * @throws UnsupportedOperationException if the operating system is not supported
 * @throws Exception if the result file does not exist after the command finished
 */
private File processFile(File tempFile, String originalFileId) throws Exception
{
    String tempDir = System.getProperty("java.io.tmpdir");
    if (!tempDir.endsWith(File.separator)) {
        tempDir = tempDir + File.separator;
    }

    String operSystem = System.getProperty("os.name").toLowerCase();
    // TODO: windows impl
    // ">= 0" for every probe: os.name "AIX" lower-cases to "aix", i.e. index 0.
    boolean unixLike = operSystem.indexOf("nix") >= 0
            || operSystem.indexOf("nux") >= 0
            || operSystem.indexOf("aix") >= 0;
    if (!unixLike) {
        // Fail with a clear message instead of trying to execute an empty command.
        throw new UnsupportedOperationException("Word count extraction is not implemented for operating system: " + operSystem);
    }

    // Pass the arguments as an array so a path containing spaces stays one argument.
    String[] processCmd = { "wc", "-w", tempDir + tempFile.getName() };
    Process p = Runtime.getRuntime().exec(processCmd, null, new File(tempDir)); // Process tracks one external native process

    // Drain both streams so the child cannot block on a full pipe.
    StreamGobbler outputGobbler = new StreamGobbler(p.getInputStream(), "INFO", log);
    StreamGobbler errorGobbler = new StreamGobbler(p.getErrorStream(),"ERROR", log);
    outputGobbler.start();
    errorGobbler.start();
    p.waitFor();

    File outFile = new File(tempDir + tempFile.getName().substring(0, tempFile.getName().lastIndexOf(".")) + ".txt");
    tempFile.delete();
    if (!Files.exists(outFile.toPath())) {
        throw new Exception("File not processed correctly. File is possibly corrupt.");
    }
    return outFile;
} |
...
A complete example of the Python extractor can be found at Medici Extractor in Python.
Calling R Scripts from Python
Coming soon...
...
Java extractors can be created using the amqp-client jar file. This allows you to connect to the RabbitMQ bus and receive messages. The easiest way to get up and running is to use Maven with Java to add all required dependencies. An example of an extractor written in Java can be found at Medici Extractor in Java.
Python
Python extractors will often be based on the packages pika and requests. This allows you to connect to the RabbitMQ message bus and easily send requests to Medici. A complete example of the Python extractor can be found at Medici Extractor in Python.
Calling R Scripts from Python
Coming soon...
Versus Extractors
Versus extractors serve to extract a signature from a file's content. These signatures, effectively a hash for the data, are typically numerical vectors which capture some semantically meaningful aspect of the content so that two such signatures can then be compared using some distance measure. Within Versus, extractors operate on a data structure representing the content of a file, produced by a Versus adapter, and the returned signatures are compared by either a Versus similarity or distance measure. The combination of these adapters, extractors, and measures in turn composes a comparison which can be used for relating files according to their contents.
Java
The main class sets up the comparison. This is done by adding the two files that need to be compared, as well as the adapter to load the file, the extractor to extract a feature from the file, and a measure to compare the two features.
Code Block |
---|
|
static public void main(String[] args) {
PairwiseComparison comparison = new PairwiseComparison();
comparison.setId(UUID.randomUUID().toString() |
...
Versus extractors serve to extract a signature from a file's content. These signatures, effectively a hash for the data, are typically numerical vectors which capture some semantically meaningful aspect of the content so that two such signatures can then be compared using some distance measure. Within Versus, extractors operate on a data structure representing the content of a file, produced by a Versus adapter, and the returned signatures are compared by either a Versus similarity or distance measure. The combination of these adapters, extractors, and measures in turn composes a comparison which can be used for relating files according to their contents.
...
Code Block |
---|
language | java |
---|
title | PDF Adapter |
---|
|
public class PDFAdapter implements FileLoader, HasRGBPixels, HasText, HasLineGraphics
{
private File file;
private double[][][] pixels;
private List<String> words;
private List<Path2D> graphics;
static public void main(String[] args) {
List<Double> weights = new ArrayList<Double>();
List<PairwiseComparison> comparisons = new ArrayList<PairwiseComparison>();
PairwiseComparison comparison = new PairwiseComparison();
comparison.setId(UUID.randomUUID().toString());
comparison.setFirstDataset(new File("data/test1.pdf"));
comparison.setSecondDataset(new File("data/test2.pdf"));
comparison.setAdapterId(PDFAdapter.class.getName());
comparison.setExtractorId(TextHistogramExtractor.class.getName());
comparison.setMeasureId(LabelHistogramEuclidianDistanceMeasure.class.getName());
comparisons.add(comparison);
weights.add(0.7);
comparison = new PairwiseComparison();
comparison.setId(UUID.randomUUID().toString());
comparison.setFirstDataset(new File("data/test1.pdf"));
comparison.setSecondDataset(new File("data/test2.pdf"));
comparison.setAdapterId(PDFAdapter.class.getName());
comparison.setExtractorId(TextHistogramExtractor.class.getName());
comparison.setMeasureId(LabelHistogramEuclidianDistanceMeasure.class.getName());
comparisons.add(comparison);
weights.add(0.2);
comparison = new PairwiseComparison();
comparison.setId(UUID.randomUUID().toString());
comparison.setFirstDataset(new File("data/test1.pdf"));
comparison.setSecondDataset(new File("data/test2.pdf"));
comparison.setAdapterId(PDFAdapter.class.getName());
comparison.setExtractorId(TextHistogramExtractor.class.getName());
comparison.setMeasureId(LabelHistogramEuclidianDistanceMeasure.class.getName());
comparisons.add(comparison);
weights.add(0.1);
ComprehensiveEngine engine = new ComprehensiveEngine();
Double d = engine.compute(comparisons, weights);
System.out.println(d);
System.exit(0);
ExecutionEngine ee = new ExecutionEngine();
ee.submit(comparison, new ComparisonStatusHandler() {
@Override
public void onStarted() {
System.out.println("STARTED : ");
}
@Override
public void onFailed(String msg, Throwable e) {
System.out.println("FAILED : " + msg);
e.printStackTrace();
System.exit(0);
}
@Override
public void onDone(double value) {
System.out.println("DONE : " + value);
System.exit(0);
}
@Override
public void onAborted(String msg) {
System.out.println("ABORTED : " + msg);
System.exit(0);
}
});
}
public PDFAdapter() {}
// ----------------------------------------------------------------------
// FileLoader
// ----------------------------------------------------------------------
@Override
public void load(File file) {
this.file = file;
}
@Override
public String getName() {
return "PDF Document";
}
@Override
public List<String> getSupportedMediaTypes() {
List<String> mediaTypes = new ArrayList<String>();
mediaTypes.add("application/pdf");
return mediaTypes;
}
// ----------------------------------------------------------------------
// HasRGBPixels
// ----------------------------------------------------------------------
@Override
public double getRGBPixel(int row, int column, int band) {
if ((pixels == null) && (getRGBPixels() == null)) {
return Double.NaN;
} else {
return pixels[row][column][band];
}
}
@Override
public double[][][] getRGBPixels() {
if (pixels == null) {
// create monster array.
try {
loadImages();
} catch (IOException e) {
e.printStackTrace();
return null;
}
}
return pixels;
}
private void loadImages() throws IOException {
PDFParser parser = new PDFParser(new FileInputStream(file), PDFParser.EXTRACT_IMAGES);
// get all images in the pdf document
List<PDFObjectImage> images = new ArrayList<PDFObjectImage>();
for (int i = 0; i < parser.getPageCount(); i++) {
parser.parse(i);
for (PDFObject po : parser.getObjects()) {
if (po instanceof PDFObjectImage) {
PDFObjectImage poi = (PDFObjectImage) po;
images.add(poi);
}
}
}
// create a virtual image that is all the images combined
// first column is the image number
// second column is pixel (col + row*width)
// third column is RGB value
pixels = new double[images.size()][][];
for (int i = 0; i < images.size(); i++) {
PDFObjectImage poi = images.get(i);
int w = poi.getImage().getWidth();
int h = poi.getImage().getHeight();
int[] rgb = poi.getImage().getRGB(0, 0, w, h, null, 0, w);
pixels[i] = new double[rgb.length][3];
for (int j = 0; j < rgb.length; j++) {
pixels[i][j][0] = (rgb[j] & 0xff0000) >> 16;
pixels[i][j][1] = (rgb[j] & 0x00ff00) >> 8;
pixels[i][j][2] = (rgb[j] & 0x0000ff) >> 0;
}
}
// close the parser
parser.close();
}
// ----------------------------------------------------------------------
// HasText
// ----------------------------------------------------------------------
@Override
public List<String> getWords() {
if (words == null) {
words = new ArrayList<String>();
try {
PDFParser parser = new PDFParser(new FileInputStream(file), PDFParser.EXTRACT_TEXT);
comparison.setFirstDataset(new File("data/test1.txt"));
PDFGroupingText textgroup = comparison.setSecondDataset(new PDFGroupingText(PDFGroupingText.REMOVE_EMPTY_LINESFile("data/test2.txt"));
comparison.setAdapterId(TextAdapter.class.getName());
comparison.setExtractorId(TextHistogramExtractor.class.getName());
for (int i = 0; i < parser.getPageCount(); i++) {
comparison.setMeasureId(LabelHistogramEuclidianDistanceMeasure.class.getName());
ExecutionEngine ee = new ExecutionEngine();
ee.submit(comparison, new parser.parseComparisonStatusHandler(i); {
@Override
for (PDFObject popublic : textgroup.group(parser.getObjectsvoid onStarted())) {
System.out.println("STARTED : ");
if (po instanceof PDFObjectText) { }
@Override
public void for onFailed(String smsg, :Throwable ((PDFObjectText) po).getText().split("\\W+")) { //$NON-NLS-1$
e) {
System.out.println("FAILED : " + msg);
if (!se.isEmptyprintStackTrace()) {;
System.exit(0);
}
words.add(s);
@Override
public void onDone(double value) {
}
System.out.println("DONE : " + value);
}System.exit(0);
}
}@Override
public void onAborted(String msg) {
}
System.out.println("ABORTED : " + }
msg);
parserSystem.closeexit(0);
} catch (IOException e) {
});
e.printStackTrace();
}
} |
The text adapter takes a text file, loads the entire file, splits the text into words, and returns a list of all words in the text. The words remain in their original order, so the original content of the file can be read by reading the words in the order in which they are returned by getWords().
Code Block |
---|
language | java |
---|
title | Text Adapter |
---|
|
public class TextAdapter implements FileLoader, HasText {
private File }
file;
private returnList<String> words;
public TextAdapter() {}
// ----------------------------------------------------------------------
// HasLineGraphicsFileLoader
// ----------------------------------------------------------------------
@Override
public List<Path2D>void getLineGraphicsload(File file) {
if (graphics == null) {this.file = file;
}
@Override
graphicspublic =String new ArrayList<Path2D>getName(); {
return "Text Document";
try {}
@Override
public List<String> getSupportedMediaTypes() {
PDFParserList<String> parsermediaTypes = new PDFParser(new FileInputStream(file), PDFParser.EXTRACT_GRAPHICSArrayList<String>();
mediaTypes.add("text/*");
PDFGroupingGraphics textgroup = new PDFGroupingGraphics()return mediaTypes;
}
// ----------------------------------------------------------------------
// HasText
for (int i = 0; i < parser.getPageCount(); i++) {// ----------------------------------------------------------------------
@Override
public List<String> getWords() {
if parser.parse(i);
(words == null) {
words = new ArrayList<String>();
for (PDFObject po : textgroup.group(parser.getObjects())) {
try {
BufferedReader br = new ifBufferedReader(new (po instanceof PDFObjectGraphics) {
FileReader(file));
String line;
graphics.addwhile(((PDFObjectGraphics) po).getPathline = br.readLine()); != null) {
String[] w = }line.split(" ");
}words.addAll(Arrays.asList(w));
}
parserbr.close();
} catch (IOException e) {
e.printStackTrace();
}
}
return words;
return graphics;
}
} }
} |
The extractor takes the words returned by the adapter and counts the occurrences of each word. At this point we are left with a histogram of all words and how often they occur in the text. We can no longer read the text, since the information about the order of the words is lost.
Code Block |
---|
language | java |
---|
title | Text Histogram Extractor |
---|
|
/**
 * Extractor that turns the words supplied by a {@code HasText} adapter into a
 * {@code LabelHistogramDescriptor} counting how often each word occurs.
 */
public class TextHistogramExtractor implements Extractor
{
    /** Not supported by this extractor; always throws. */
    @Override
    public Adapter newAdapter() {
        throw (new RuntimeException("Not supported."));
    }

    @Override
    public String getName() {
        return "Text Histogram Extractor";
    }

    /** @return a set containing only {@code HasText} */
    @Override
    public Set<Class<? extends Adapter>> supportedAdapters() {
        Set<Class<? extends Adapter>> supported = new HashSet<Class<? extends Adapter>>();
        supported.add(HasText.class);
        return supported;
    }

    @Override
    public Class<? extends Descriptor> getFeatureType() {
        return LabelHistogramDescriptor.class;
    }

    /**
     * Builds the word histogram for the given adapter.
     *
     * @param adapter adapter providing the words; must implement {@code HasText}
     * @return histogram with one bin increment per word returned by the adapter
     * @throws UnsupportedTypeException if the adapter does not implement {@code HasText}
     */
    @Override
    public Descriptor extract(Adapter adapter) throws Exception {
        // Reject unsupported adapters up front.
        if (!(adapter instanceof HasText)) {
            throw new UnsupportedTypeException();
        }
        HasText textSource = (HasText) adapter;
        LabelHistogramDescriptor histogram = new LabelHistogramDescriptor();
        for (String token : textSource.getWords()) {
            histogram.increaseBin(token);
        }
        return histogram;
    }

    @Override
    public boolean hasPreview(){
        return false;
    }

    @Override
    public String previewName(){
        return null;
    }
} |
To compare two texts we use the Euclidean distance measure of two histograms. First we normalize each histogram, so we can compare a large text with a small text. Next we compare each bin of the two histograms. If the bin is missing from either histogram it is assumed to have a value of 0.
Code Block |
---|
language | java |
---|
title | Euclidian Distance Measure |
---|
|
/**
 * Measure that compares two {@code LabelHistogramDescriptor}s by the Euclidean
 * distance between their normalized bins.
 */
public class LabelHistogramEuclidianDistanceMeasure implements Measure
{
    /** Converts a distance into a percentage by subtracting its value from 1. */
    @Override
    public SimilarityPercentage normalize(Similarity similarity) {
        return new SimilarityPercentage(1 - similarity.getValue());
    }

    @Override
    public String getFeatureType() {
        return LabelHistogramDescriptor.class.getName();
    }

    @Override
    public String getName() {
        return "Histogram Distance";
    }

    @Override
    public Class<LabelHistogramEuclidianDistanceMeasure> getType() {
        return LabelHistogramEuclidianDistanceMeasure.class;
    }

    /**
     * Computes the Euclidean distance between two label histograms. Both
     * descriptors have {@code normalize()} called on them; a label present in only
     * one histogram contributes the square of that histogram's bin value.
     *
     * @param desc1 first histogram descriptor
     * @param desc2 second histogram descriptor
     * @return the square root of the summed squared bin differences
     * @throws UnsupportedTypeException if either descriptor is not a
     *                                  {@code LabelHistogramDescriptor}
     */
    // correlation
    @Override
    public Similarity compare(Descriptor desc1, Descriptor desc2) throws Exception {
        boolean bothHistograms = (desc1 instanceof LabelHistogramDescriptor)
                && (desc2 instanceof LabelHistogramDescriptor);
        if (!bothHistograms) {
            throw new UnsupportedTypeException();
        }
        LabelHistogramDescriptor first = (LabelHistogramDescriptor) desc1;
        LabelHistogramDescriptor second = (LabelHistogramDescriptor) desc2;

        // union of the labels of both histograms, collected before normalizing
        Set<String> allLabels = new HashSet<String>();
        allLabels.addAll(first.getLabels());
        allLabels.addAll(second.getLabels());

        first.normalize();
        second.normalize();

        // accumulate the squared difference per label
        double squaredTotal = 0;
        for (String label : allLabels) {
            Double left = first.getBin(label);
            Double right = second.getBin(label);
            double term;
            if (left == null) {
                term = right * right;
            } else if (right == null) {
                term = left * left;
            } else {
                term = (left - right) * (left - right);
            }
            squaredTotal += term;
        }
        return new SimilarityNumber(Math.sqrt(squaredTotal), 0, 1, 0);
    }
} |