public abstract class Crawler extends Object
Abstract crawler implementation providing a common base for building crawlers.
As of 1.6.1, JMX support is disabled by default. To enable it,
set the system property "enableJMX" to true
. You can do so
by adding this to your Java launch command:
-DenableJMX=true
CrawlerConfig
Modifier and Type | Class and Description |
---|---|
protected static class |
Crawler.ReferenceProcessStatus |
Constructor and Description |
---|
Crawler(CrawlerConfig config,
Collector collector)
Constructor.
|
Modifier and Type | Method and Description |
---|---|
protected abstract void |
afterCrawlerExecution()
Gives crawler implementations a chance to do something right after
the crawler is done processing its last reference, before all resources
are shut down.
|
protected abstract void |
beforeCrawlerExecution(boolean resume)
Gives crawler implementations a chance to prepare before execution starts.
Invoked right after the
CrawlerEvent.CRAWLER_RUN_BEGIN is fired. |
protected void |
beforeFinalizeDocumentProcessing(CrawlDoc doc)
Gives implementors a chance to take action on a document before
its processing is being finalized (cycle end-of-life for a crawled
reference).
|
void |
clean() |
protected abstract CrawlDocInfo |
createChildDocInfo(String embeddedReference,
CrawlDocInfo parentCrawlRef) |
protected void |
deleteCacheOrphans() |
protected void |
destroyCrawler() |
protected void |
doExecute() |
protected abstract void |
executeCommitterPipeline(Crawler crawler,
CrawlDoc doc) |
protected abstract ImporterResponse |
executeImporterPipeline(ImporterPipelineContext context) |
protected abstract void |
executeQueuePipeline(CrawlDocInfo ref) |
Path |
exportDataStore(Path dir) |
Collector |
getCollector() |
CrawlerCommitterService |
getCommitterService() |
protected Class<? extends CrawlDocInfo> |
getCrawlDocInfoType() |
CrawlerConfig |
getCrawlerConfig()
Gets the crawler configuration.
|
IDataStoreEngine |
getDataStoreEngine() |
CrawlDocInfoService |
getDocInfoService() |
Path |
getDownloadDir() |
EventManager |
getEventManager()
Gets the event manager.
|
String |
getId() |
Importer |
getImporter()
Gets the crawler Importer module.
|
CrawlerMonitor |
getMonitor() |
CachedStreamFactory |
getStreamFactory() |
Path |
getTempDir()
Gets the directory where most temporary files are created for the
duration of a crawling session.
|
Path |
getWorkDir()
Gets the directory where files needing to be persisted between
crawling sessions are kept.
|
protected void |
handleOrphans() |
void |
importDataStore(Path inFile) |
protected void |
initCrawlDoc(CrawlDoc document) |
protected boolean |
initCrawler() |
protected boolean |
isMaxDocuments() |
protected boolean |
isQueueInitialized() |
boolean |
isStopped()
Whether the crawler job was stopped.
|
protected abstract void |
markReferenceVariationsAsProcessed(CrawlDocInfo crawlRef) |
protected Crawler.ReferenceProcessStatus |
processNextReference(com.norconex.collector.core.crawler.Crawler.ProcessFlags flags) |
protected void |
processReferences(com.norconex.collector.core.crawler.Crawler.ProcessFlags flags) |
protected void |
reprocessCacheOrphans() |
void |
start()
Starts crawling.
|
void |
stop() |
String |
toString() |
public Crawler(CrawlerConfig config, Collector collector)
config
- crawler configuration
collector
- the collector this crawler is attached to
public EventManager getEventManager()
public CrawlerMonitor getMonitor()
public CrawlerCommitterService getCommitterService()
public String getId()
public boolean isStopped()
true
if stopped
public void stop()
public Importer getImporter()
public CachedStreamFactory getStreamFactory()
public CrawlerConfig getCrawlerConfig()
public Collector getCollector()
public Path getWorkDir()
null
public Path getTempDir()
null
public Path getDownloadDir()
public void start()
protected boolean initCrawler()
protected Class<? extends CrawlDocInfo> getCrawlDocInfoType()
public IDataStoreEngine getDataStoreEngine()
public CrawlDocInfoService getDocInfoService()
public void clean()
public void importDataStore(Path inFile)
protected void destroyCrawler()
protected abstract void beforeCrawlerExecution(boolean resume)
CrawlerEvent.CRAWLER_RUN_BEGIN
is fired.
This method is different than the initCrawler()
method, which
is invoked for any type of actions whereas this one is only invoked
before an effective request for crawling.
resume
- whether the crawl is resuming from an unfinished session.
protected abstract void afterCrawlerExecution()
CrawlerEvent.CRAWLER_STOP_END
or
CrawlerEvent.CRAWLER_RUN_END
(depending which of the two is
triggered).
protected void doExecute()
protected void handleOrphans()
protected boolean isMaxDocuments()
protected void reprocessCacheOrphans()
protected abstract void executeQueuePipeline(CrawlDocInfo ref)
protected void deleteCacheOrphans()
protected void processReferences(com.norconex.collector.core.crawler.Crawler.ProcessFlags flags)
protected Crawler.ReferenceProcessStatus processNextReference(com.norconex.collector.core.crawler.Crawler.ProcessFlags flags)
protected void initCrawlDoc(CrawlDoc document)
protected void beforeFinalizeDocumentProcessing(CrawlDoc doc)
doc
- the document
protected abstract void markReferenceVariationsAsProcessed(CrawlDocInfo crawlRef)
protected abstract CrawlDocInfo createChildDocInfo(String embeddedReference, CrawlDocInfo parentCrawlRef)
protected abstract ImporterResponse executeImporterPipeline(ImporterPipelineContext context)
protected abstract void executeCommitterPipeline(Crawler crawler, CrawlDoc doc)
protected boolean isQueueInitialized()
Copyright © 2014–2023 Norconex Inc. All rights reserved.