[opencms-dev] How to make Lucene use Custom Extractors
Eska
sven.kiesow at interone.de
Fri Oct 30 11:36:26 CET 2009
Hi List,
in the meantime I found the solution to my problem. OpenCms uses an
extract cache (located in WEB-INF/index/extractCache) and only reindexes
files if their content has changed. So if you
would like to force a reindexing of all files using a custom extractor,
shut down your Tomcat and remove all files in WEB-INF/index/extractCache.
In addition, make your class extend A_CmsVfsDocument.
Eska
Eska wrote:
>
> Hi List,
>
> has anyone an Idea how to be able to configure a Custom Extractor that can
> be used while rebuilding the search indexes on OpenCms 7.0.5? We have an
> OpenCms
> where some Content has OpenCmsString defined, but the HtmlWidget is
> necessary to
> edit content. The problem is, that using the default configuration the
> Lucene Excerpts
> also hold HTML-Snippets, which are shown in the Search-Result. Our Client
> now wants
> the HTML-Code removed from the Search-Result-Excerpts.
>
> Changing the OpenCmsString to OpenCmsHtml is NOT an option.
>
> What I tried is the following:
>
> 1) create a custom extension of
> org.opencms.search.documents.CmsDocumentXmlContent
> (test.CmsDocumentXmlContent) with the Method
>
> "public I_cmsExtractionResult extractContent(CmsObject cms, CmsResource
> resource, CmsSearchIndex index) throws CmsException"
>
> being a copy of the original in the CVS, but changed to always use the
> CmsHtmlExtractor to gather the extracted data. Further, I added some
> LOG.error() entries to see if the method is invoked.
>
> 2) Configure that as a replacement in opencms-search.xml
>
> <documenttype>
> <name>xmlcontent</name>
> <class>test.CmsDocumentXmlContent</class>
> ...
> </documenttype>
>
> 3) The test.CmsDocumentXmlContent class is present in
> WEB-INF/classes/test/
>
> During Server start I can see the following Message in opencms.log:
>
> "Search document types: adding "xmlcontent" using handler
> test.CmsDocumentXmlContent"
>
> When I now try to rebuild a search index, it seems my custom
> CmsDocumentXmlContent
> is not invoked, because the log-output is not written to the opencms.log
> and the search-results
> are not changed and still show the HTML-Snippets.
>
> Any Ideas, how I can reach my goal or what I did wrong?
>
> Thanks
>
> Eska
>
>
Here is the Source I found to work:
package com.javaentwicklung;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import org.apache.commons.logging.Log;
import org.opencms.file.CmsFile;
import org.opencms.file.CmsObject;
import org.opencms.file.CmsResource;
import org.opencms.file.types.CmsResourceTypeXmlContent;
import org.opencms.file.types.I_CmsResourceType;
import org.opencms.main.CmsException;
import org.opencms.main.CmsLog;
import org.opencms.main.OpenCms;
import org.opencms.search.CmsIndexException;
import org.opencms.search.CmsSearchIndex;
import org.opencms.search.documents.A_CmsVfsDocument;
import org.opencms.search.documents.Messages;
import org.opencms.search.extractors.CmsExtractionResult;
import org.opencms.search.extractors.I_CmsExtractionResult;
import org.opencms.util.CmsHtmlExtractor;
import org.opencms.util.CmsStringUtil;
import org.opencms.xml.A_CmsXmlDocument;
import org.opencms.xml.content.CmsXmlContentFactory;
import org.opencms.xml.types.I_CmsXmlContentValue;
public class CmsDocumentXmlContentNew extends A_CmsVfsDocument {
private static final Log LOG =
CmsLog.getLog(com.javaentwicklung.CmsDocumentXmlContentNew.class);
public CmsDocumentXmlContentNew(String name) {
super(name);
}
/**
* Returns the raw text content of a given VFS resource of type
<code>CmsResourceTypeXmlContent</code>.<p>
*
* All XML nodes from the content for all locales will be stored
separately in the item map
* which you can access using {@link
CmsExtractionResult#getContentItems()}. The XML elements will be
* accessible using their xpath. The xpath will have the form like for
example
* <code>Text[1]</code> or <code>Nested[1]/Text[1]</code>.<p>
*
* @see
org.opencms.search.documents.I_CmsSearchExtractor#extractContent(CmsObject,
CmsResource, CmsSearchIndex)
*/
public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource
resource, CmsSearchIndex index)
throws CmsException {
try {
CmsFile file = readFile(cms, resource);
String absolutePath = cms.getSitePath(file);
A_CmsXmlDocument xmlContent =
CmsXmlContentFactory.unmarshal(cms, file);
List locales = xmlContent.getLocales();
if (locales.size() == 0) {
locales = OpenCms.getLocaleManager().getDefaultLocales(cms,
absolutePath);
}
Locale locale =
OpenCms.getLocaleManager().getBestMatchingLocale(
index.getLocale(),
OpenCms.getLocaleManager().getDefaultLocales(cms,
absolutePath),
locales);
List elements = xmlContent.getNames(locale);
StringBuffer content = new StringBuffer();
HashMap items = new HashMap();
for (Iterator i = elements.iterator(); i.hasNext();) {
String xpath = (String)i.next();
// xpath will have the form "Text[1]" or "Nested[1]/Text[1]"
boolean search = true;
/*
Here you can make a decision to exclude certain content
from being index by
checking the occurence of certain elements in the xpath,
e.g.
boolean search = true;
if (xpath.indexOf("/LinkExtern") > 0)
{
search = false;
}
else if (xpath.indexOf("/IframeUrl") > 0)
{
search = false;
}
else if (xpath.indexOf("/HomePageLink") > 0)
{
search = false;
}
*/
I_CmsXmlContentValue value = xmlContent.getValue(xpath,
locale);
if (search &&
value.getContentDefinition().getContentHandler().isSearchable(value)) {
// the content value is searchable
String extracted;
if (value.isSimpleType())
{
extracted =
CmsHtmlExtractor.extractText(value.getStringValue(cms),
xmlContent.getEncoding());
}
else
{
extracted = value.getPlainText(cms);
}
if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(extracted))
{
items.put(xpath, extracted);
content.append(extracted);
content.append('\n');
}
}
}
return new CmsExtractionResult(content.toString(), items);
} catch (Exception e) {
throw new CmsIndexException(
Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1,
resource.getRootPath()),
e);
}
}
/**
* @see
org.opencms.search.documents.I_CmsDocumentFactory#getDocumentKeys(java.util.List,
java.util.List)
*/
public List getDocumentKeys(List resourceTypes, List mimeTypes) throws
CmsException {
if (resourceTypes.contains("*")) {
// we need to find all configured XML content types
ArrayList allTypes = new ArrayList();
for (Iterator i =
OpenCms.getResourceManager().getResourceTypes().iterator(); i.hasNext();) {
I_CmsResourceType resourceType =
(I_CmsResourceType)i.next();
if ((resourceType instanceof CmsResourceTypeXmlContent)
// either we need a configured schema, or another class name
(which must then contain an inline schema)
&&
(((CmsResourceTypeXmlContent)resourceType).getConfiguration().containsKey(
CmsResourceTypeXmlContent.CONFIGURATION_SCHEMA) ||
!CmsResourceTypeXmlContent.class.equals(resourceType.getClass()))) {
// add the XML content resource type name
allTypes.add(resourceType.getTypeName());
}
}
resourceTypes = allTypes;
}
return super.getDocumentKeys(resourceTypes, mimeTypes);
}
/**
* @see
org.opencms.search.documents.I_CmsDocumentFactory#isLocaleDependend()
*/
public boolean isLocaleDependend() {
return true;
}
/**
* @see org.opencms.search.documents.I_CmsDocumentFactory#isUsingCache()
*/
public boolean isUsingCache() {
return true;
}
}
--
View this message in context: http://old.nabble.com/How-to-make-Lucene-use-Custom-Extractors-tp25747875p26127886.html
Sent from the OpenCMS - Dev mailing list archive at Nabble.com.
More information about the opencms-dev
mailing list