public abstract class AbstractCrawlerConfig extends Object implements ICrawlerConfig
ICrawlerConfig.OrphansStrategy
Constructor and Description |
---|
AbstractCrawlerConfig()
Creates a new crawler configuration.
|
Modifier and Type | Method and Description |
---|---|
protected <T> T[] |
defaultIfEmpty(T[] array,
T[] defaultArray) |
boolean |
equals(Object other) |
ICommitter |
getCommitter()
Gets the Committer module configuration.
|
ICrawlDataStoreFactory |
getCrawlDataStoreFactory()
Gets the crawl data store factory a crawler should use.
|
ICrawlerEventListener[] |
getCrawlerListeners()
Gets crawler event listeners.
|
IDocumentChecksummer |
getDocumentChecksummer()
Gets the document checksummer.
|
IDocumentFilter[] |
getDocumentFilters()
Gets the document filters.
|
String |
getId()
Gets this crawler's unique identifier.
|
ImporterConfig |
getImporterConfig()
Gets the Importer module configuration.
|
int |
getMaxDocuments()
Gets the maximum number of documents that can be processed.
|
IMetadataFilter[] |
getMetadataFilters()
Gets the metadata filters.
|
int |
getNumThreads()
Gets the number of threads (maximum) a crawler should use.
|
ICrawlerConfig.OrphansStrategy |
getOrphansStrategy()
Gets the strategy to adopt when there are orphans.
|
IReferenceFilter[] |
getReferenceFilters()
Gets the reference filters.
|
ISpoiledReferenceStrategizer |
getSpoiledReferenceStrategizer()
Gets the spoiled state strategy resolver.
|
Class<? extends Exception>[] |
getStopOnExceptions()
Gets the exceptions we want to stop the crawler on.
|
File |
getWorkDir()
Gets the crawler working directory where many files created at
execution time are stored.
|
int |
hashCode() |
protected abstract void |
loadCrawlerConfigFromXML(XMLConfiguration xml) |
void |
loadFromXML(Reader in) |
protected abstract void |
saveCrawlerConfigToXML(Writer out) |
void |
saveToXML(Writer out) |
void |
setCommitter(ICommitter committer) |
void |
setCrawlDataStoreFactory(ICrawlDataStoreFactory crawlDataStoreFactory) |
void |
setCrawlerListeners(ICrawlerEventListener... crawlerListeners) |
void |
setDocumentChecksummer(IDocumentChecksummer documentChecksummer) |
void |
setDocumentFilters(IDocumentFilter... documentfilters) |
void |
setId(String id)
Sets this crawler's unique identifier.
|
void |
setImporterConfig(ImporterConfig importerConfig) |
void |
setMaxDocuments(int maxDocuments) |
void |
setMetadataFilters(IMetadataFilter... metadataFilters) |
void |
setNumThreads(int numThreads) |
void |
setOrphansStrategy(ICrawlerConfig.OrphansStrategy orphansStrategy) |
void |
setReferenceFilters(IReferenceFilter... referenceFilters)
Sets the reference filters.
|
void |
setSpoiledReferenceStrategizer(ISpoiledReferenceStrategizer spoiledReferenceStrategizer) |
void |
setStopOnExceptions(Class<? extends Exception>... stopOnExceptions)
Sets the exceptions we want to stop the crawler on.
|
void |
setWorkDir(File workDir) |
String |
toString() |
protected void |
writeArray(Writer out,
String listTagName,
String objectTagName,
Object[] array) |
protected void |
writeObject(Writer out,
String tagName,
Object object) |
protected void |
writeObject(Writer out,
String tagName,
Object object,
boolean ignore) |
public AbstractCrawlerConfig()
public String getId()
getId
in interface ICrawlerConfig
public void setId(String id)
id
- unique identifier
public int getNumThreads()
ICrawlerConfig
getNumThreads
in interface ICrawlerConfig
public void setNumThreads(int numThreads)
public File getWorkDir()
ICrawlerConfig
getWorkDir
in interface ICrawlerConfig
public void setWorkDir(File workDir)
public int getMaxDocuments()
ICrawlerConfig
getMaxDocuments
in interface ICrawlerConfig
public void setMaxDocuments(int maxDocuments)
public ICrawlerConfig.OrphansStrategy getOrphansStrategy()
ICrawlerConfig
Gets the strategy to adopt when there are orphans. Orphans are references that were processed in a previous run, but were not in the current run. In other words, they are leftovers from a previous run that were not re-encountered in the current.
Unless explicitly stated otherwise by an implementing class, the default
strategy is to PROCESS orphans.
Setting a null value is the same as setting IGNORE.
Since 1.2.0, unless otherwise stated in implementing classes,
the default orphan strategy is now PROCESS.
Be careful: Setting the orphan strategy to DELETE
is NOT recommended in most cases. With some collectors, a temporary
failure, such as a network outage or a web page timing out, may cause
some documents not to be crawled. When this happens, unreachable
documents would be considered "orphans" and be deleted, while under
normal circumstances they should be kept. Re-processing them
(the default) is usually the safest approach to confirm they still
exist before deleting or updating them.
getOrphansStrategy
in interface ICrawlerConfig
public void setOrphansStrategy(ICrawlerConfig.OrphansStrategy orphansStrategy)
public Class<? extends Exception>[] getStopOnExceptions()
ICrawlerConfig
getStopOnExceptions
in interface ICrawlerConfig
public void setStopOnExceptions(Class<? extends Exception>... stopOnExceptions)
stopOnExceptions
- exceptions that will stop the crawler when
encountered
public ICrawlDataStoreFactory getCrawlDataStoreFactory()
ICrawlerConfig
getCrawlDataStoreFactory
in interface ICrawlerConfig
public void setCrawlDataStoreFactory(ICrawlDataStoreFactory crawlDataStoreFactory)
public ICrawlerEventListener[] getCrawlerListeners()
ICrawlerConfig
getCrawlerListeners
in interface ICrawlerConfig
public void setCrawlerListeners(ICrawlerEventListener... crawlerListeners)
public ISpoiledReferenceStrategizer getSpoiledReferenceStrategizer()
ICrawlerConfig
getSpoiledReferenceStrategizer
in interface ICrawlerConfig
public void setSpoiledReferenceStrategizer(ISpoiledReferenceStrategizer spoiledReferenceStrategizer)
public IReferenceFilter[] getReferenceFilters()
getReferenceFilters
in interface ICrawlerConfig
public void setReferenceFilters(IReferenceFilter... referenceFilters)
referenceFilters
- the referenceFilters to set
public IDocumentFilter[] getDocumentFilters()
ICrawlerConfig
getDocumentFilters
in interface ICrawlerConfig
public void setDocumentFilters(IDocumentFilter... documentfilters)
public IMetadataFilter[] getMetadataFilters()
ICrawlerConfig
getMetadataFilters
in interface ICrawlerConfig
public void setMetadataFilters(IMetadataFilter... metadataFilters)
public IDocumentChecksummer getDocumentChecksummer()
ICrawlerConfig
getDocumentChecksummer
in interface ICrawlerConfig
public void setDocumentChecksummer(IDocumentChecksummer documentChecksummer)
public ImporterConfig getImporterConfig()
ICrawlerConfig
getImporterConfig
in interface ICrawlerConfig
public void setImporterConfig(ImporterConfig importerConfig)
public ICommitter getCommitter()
ICrawlerConfig
getCommitter
in interface ICrawlerConfig
public void setCommitter(ICommitter committer)
public void saveToXML(Writer out) throws IOException
saveToXML
in interface IXMLConfigurable
IOException
protected abstract void saveCrawlerConfigToXML(Writer out) throws IOException
IOException
public final void loadFromXML(Reader in) throws IOException
loadFromXML
in interface IXMLConfigurable
IOException
protected abstract void loadCrawlerConfigFromXML(XMLConfiguration xml) throws IOException
IOException
protected void writeObject(Writer out, String tagName, Object object) throws IOException
IOException
protected void writeObject(Writer out, String tagName, Object object, boolean ignore) throws IOException
IOException
protected void writeArray(Writer out, String listTagName, String objectTagName, Object[] array) throws IOException
IOException
protected <T> T[] defaultIfEmpty(T[] array, T[] defaultArray)
Copyright © 2014–2021 Norconex Inc. All rights reserved.