Confluence 2.8 : Attachment Content Extractor Plugins
This page last changed on Jan 09, 2006 by cmiller.
Attachment content extractor plugins enable Confluence to index the contents of attachments that it may not otherwise understand. Before you read this document, you should be familiar with Extractor Plugins.

The BaseAttachmentContentExtractor class

Attachment content extractor plugins must extend the bucket.search.lucene.extractor.BaseAttachmentContentExtractor base class. The skeleton of this class is:

package bucket.search.lucene.extractor;

import bucket.search.lucene.Extractor;
import bucket.search.lucene.SearchableAttachment;
import bucket.search.Searchable;
import org.apache.lucene.document.Document;
import com.opensymphony.util.TextUtils;

import java.io.InputStream;
import java.io.IOException;

public abstract class BaseAttachmentContentExtractor implements Extractor
{
    /** You should not have to override this method */
    public void addFields(Document document, StringBuffer defaultSearchableText, Searchable searchable);

    /**
     * Override this method if you cannot get the functionality you want by overriding
     * getMatchingContentTypes() and getMatchingFileExtensions()
     */
    protected boolean shouldExtractFrom(String fileName, String contentType);

    /**
     * Override this method to return the MIME content-types that your plugin knows how to extract
     * text from. If you have already overridden shouldExtractFrom(), this method is useless
     */
    protected String[] getMatchingContentTypes()
    {
        return new String[0];
    }

    /**
     * Override this method to return the filename extensions that your plugin knows how to extract
     * text from. If you have already overridden shouldExtractFrom(), this method is useless
     */
    protected String[] getMatchingFileExtensions()
    {
        return new String[0];
    }

    /**
     * Override this method to do the actual work of extracting the content of the attachment.
     * Your extractor should return the text that is to be indexed
     */
    protected abstract String extractText(InputStream is, SearchableAttachment attachment) throws IOException;
}
An Example

This is an example of a hypothetical extractor that extracts the contents of mp3 ID3 tags.

package com.example.extras.extractor;

import com.hypothetical.id3.Id3Tag;
import bucket.search.lucene.extractor.BaseAttachmentContentExtractor;
import bucket.search.lucene.SearchableAttachment;

import java.io.InputStream;
import java.io.IOException;

public class Id3Extractor extends BaseAttachmentContentExtractor
{
    public static final String[] MIME_TYPES = {"audio/x-mp3", "audio/mpeg", "audio/mp4a-latm"};
    public static final String[] FILE_EXTS = {"mp3", "m4a"};

    protected String extractText(InputStream is, SearchableAttachment attachment) throws IOException
    {
        Id3Tag tag = Id3Tag.parse(is);
        return (tag.getTitle() + " " + tag.getArtist() + " " + tag.getGenre() + " " + tag.getAlbumTitle());
    }

    protected String[] getMatchingContentTypes()
    {
        return MIME_TYPES;
    }

    protected String[] getMatchingFileExtensions()
    {
        return FILE_EXTS;
    }
}
![]() |
Document generated by Confluence on Jun 24, 2008 18:04 |