public abstract class AbstractCrawler extends AbstractResumableJob implements ICrawler
Abstract crawler implementation providing a common base for building crawlers.
As of 1.6.1, JMX support is disabled by default. To enable it,
set the system property "enableJMX" to true. You can do so
by adding this to your Java launch command:
-DenableJMX=true
Modifier and Type | Class and Description |
---|---|
class |
AbstractCrawler.CopyIfNullBeanUtilsBean |
Constructor and Description |
---|
AbstractCrawler(ICrawlerConfig config)
Constructor.
|
Modifier and Type | Method and Description |
---|---|
protected void |
beforeFinalizeDocumentProcessing(BaseCrawlData crawlData,
ICrawlDataStore store,
ImporterDocument doc,
ICrawlData cachedCrawlData)
Gives implementors a chance to take action on a document before
its processing is being finalized (cycle end-of-life for a crawled
reference).
|
protected abstract void |
cleanupExecution(JobStatusUpdater statusUpdater,
JobSuite suite,
ICrawlDataStore refStore) |
protected ICrawlDataStore |
createCrawlDataStore(boolean resume) |
protected abstract BaseCrawlData |
createEmbeddedCrawlData(String embeddedReference,
ICrawlData parentCrawlData) |
protected void |
deleteCacheOrphans(ICrawlDataStore crawlDataStore,
JobStatusUpdater statusUpdater,
JobSuite suite) |
protected void |
execute(JobStatusUpdater statusUpdater,
JobSuite suite,
ICrawlDataStore crawlDataStore) |
protected abstract void |
executeCommitterPipeline(ICrawler crawler,
ImporterDocument doc,
ICrawlDataStore crawlDataStore,
BaseCrawlData crawlData,
BaseCrawlData cachedCrawlData) |
protected abstract ImporterResponse |
executeImporterPipeline(ImporterPipelineContext context) |
protected abstract void |
executeQueuePipeline(ICrawlData crawlData,
ICrawlDataStore crawlDataStore) |
void |
fireCrawlerEvent(String eventType,
ICrawlData crawlData,
Object subject) |
File |
getBaseDownloadDir() |
ICrawlerConfig |
getCrawlerConfig()
Gets the crawler configuration.
|
File |
getCrawlerDownloadDir() |
CrawlerEventManager |
getCrawlerEventManager()
Gets the crawler events manager.
|
String |
getId() |
Importer |
getImporter()
Gets the crawler Importer module.
|
CachedStreamFactory |
getStreamFactory() |
protected void |
handleOrphans(ICrawlDataStore crawlStore,
JobStatusUpdater statusUpdater,
JobSuite suite) |
protected void |
initCrawlData(ICrawlData crawlData,
ICrawlData cachedCrawlData,
ImporterDocument document) |
protected boolean |
isMaxDocuments() |
boolean |
isStopped()
Whether the crawler job was stopped.
|
protected abstract void |
markReferenceVariationsAsProcessed(BaseCrawlData crawlData,
ICrawlDataStore refStore) |
protected abstract void |
prepareExecution(JobStatusUpdater statusUpdater,
JobSuite suite,
ICrawlDataStore refStore,
boolean resume) |
protected boolean |
processNextReference(JobStatusUpdater statusUpdater,
ImporterPipelineContext context) |
protected void |
processReferences(JobStatusUpdater statusUpdater,
JobSuite suite,
ImporterPipelineContext contextPrototype) |
protected void |
reprocessCacheOrphans(ICrawlDataStore crawlDataStore,
JobStatusUpdater statusUpdater,
JobSuite suite) |
protected void |
resumeExecution(JobStatusUpdater statusUpdater,
JobSuite suite) |
protected void |
startExecution(JobStatusUpdater statusUpdater,
JobSuite suite) |
void |
stop(IJobStatus jobStatus,
JobSuite suite) |
protected abstract ImporterDocument |
wrapDocument(ICrawlData crawlData,
ImporterDocument document) |
execute
public AbstractCrawler(ICrawlerConfig config)
config
- crawler configuration

public boolean isStopped()
true
if stopped

public void stop(IJobStatus jobStatus, JobSuite suite)
public Importer getImporter()
ICrawler
getImporter
in interface ICrawler
public CachedStreamFactory getStreamFactory()
public ICrawlerConfig getCrawlerConfig()
getCrawlerConfig
in interface ICrawler
public void fireCrawlerEvent(String eventType, ICrawlData crawlData, Object subject)
public File getBaseDownloadDir()
public File getCrawlerDownloadDir()
public CrawlerEventManager getCrawlerEventManager()
ICrawler
getCrawlerEventManager
in interface ICrawler
protected void startExecution(JobStatusUpdater statusUpdater, JobSuite suite)
startExecution
in class AbstractResumableJob
protected void resumeExecution(JobStatusUpdater statusUpdater, JobSuite suite)
resumeExecution
in class AbstractResumableJob
protected ICrawlDataStore createCrawlDataStore(boolean resume)
protected abstract void prepareExecution(JobStatusUpdater statusUpdater, JobSuite suite, ICrawlDataStore refStore, boolean resume)
protected abstract void cleanupExecution(JobStatusUpdater statusUpdater, JobSuite suite, ICrawlDataStore refStore)
protected void execute(JobStatusUpdater statusUpdater, JobSuite suite, ICrawlDataStore crawlDataStore)
protected void handleOrphans(ICrawlDataStore crawlStore, JobStatusUpdater statusUpdater, JobSuite suite)
protected boolean isMaxDocuments()
protected void reprocessCacheOrphans(ICrawlDataStore crawlDataStore, JobStatusUpdater statusUpdater, JobSuite suite)
protected abstract void executeQueuePipeline(ICrawlData crawlData, ICrawlDataStore crawlDataStore)
protected void deleteCacheOrphans(ICrawlDataStore crawlDataStore, JobStatusUpdater statusUpdater, JobSuite suite)
protected void processReferences(JobStatusUpdater statusUpdater, JobSuite suite, ImporterPipelineContext contextPrototype)
protected boolean processNextReference(JobStatusUpdater statusUpdater, ImporterPipelineContext context)
protected abstract ImporterDocument wrapDocument(ICrawlData crawlData, ImporterDocument document)
protected void initCrawlData(ICrawlData crawlData, ICrawlData cachedCrawlData, ImporterDocument document)
protected void beforeFinalizeDocumentProcessing(BaseCrawlData crawlData, ICrawlDataStore store, ImporterDocument doc, ICrawlData cachedCrawlData)
crawlData
- crawl data with data the crawler was able to obtain,
guaranteed to have a non-null state

store
- crawl store

doc
- the document

cachedCrawlData
- cached crawl data (null if document was not crawled before)

protected abstract void markReferenceVariationsAsProcessed(BaseCrawlData crawlData, ICrawlDataStore refStore)
protected abstract BaseCrawlData createEmbeddedCrawlData(String embeddedReference, ICrawlData parentCrawlData)
protected abstract ImporterResponse executeImporterPipeline(ImporterPipelineContext context)
protected abstract void executeCommitterPipeline(ICrawler crawler, ImporterDocument doc, ICrawlDataStore crawlDataStore, BaseCrawlData crawlData, BaseCrawlData cachedCrawlData)
Copyright © 2014–2021 Norconex Inc. All rights reserved.