public class MongoCrawlDataStore extends AbstractCrawlDataStore
Mongo implementation of ICrawlDataStore.
All references are stored in a collection with a default name of "references". They go through the "QUEUED", "ACTIVE" and "PROCESSED" stages.
The cached references are stored in a separate collection with the default name "cached".
As of 1.9.0, you can define your own collection names using one of the new constructors.
Modifier and Type | Field and Description |
---|---|
static String |
DEFAULT_CACHED_COL_NAME |
static String |
DEFAULT_REFERENCES_COL_NAME |
Constructor and Description |
---|
MongoCrawlDataStore(boolean resume,
com.mongodb.MongoClient client,
String dbName,
IMongoSerializer serializer)
Constructor.
|
MongoCrawlDataStore(boolean resume,
com.mongodb.MongoClient client,
String dbName,
IMongoSerializer serializer,
String referencesCollectionName,
String cachedCollectionName)
Constructor.
|
MongoCrawlDataStore(String crawlerId,
boolean resume,
MongoConnectionDetails conn,
IMongoSerializer serializer)
Constructor.
|
MongoCrawlDataStore(String crawlerId,
boolean resume,
MongoConnectionDetails conn,
IMongoSerializer serializer,
String referencesCollectionName,
String cachedCollectionName)
Constructor.
|
Modifier and Type | Method and Description |
---|---|
protected static com.mongodb.MongoClient |
buildMongoClient(String crawlerId,
MongoConnectionDetails connDetails) |
void |
close()
Closes a database connection.
|
protected void |
deleteReferences(String... stages) |
int |
getActiveCount()
Gets the number of active references (currently being processed).
|
ICrawlData |
getCached(String reference)
Gets the cached reference from the previous time the crawler was run.
|
String |
getCachedCollectionName()
Gets the cached collection name.
|
Iterator<ICrawlData> |
getCacheIterator()
Gets the cache iterator.
|
ICrawlData |
getProcessed(String reference)
Gets an already processed reference from the current crawl session.
|
int |
getProcessedCount()
Gets the number of references processed.
|
int |
getQueueSize()
Gets the size of the reference queue (number of
references left to process).
|
String |
getReferencesCollectionName()
Gets the references collection name.
|
protected int |
getReferencesCount(IMongoSerializer.Stage stage) |
boolean |
isActive(String reference)
Whether the given reference is currently being processed (i.e., active).
|
boolean |
isCacheEmpty()
Whether there are any references in the cache from a previous crawler
run.
|
boolean |
isProcessed(String reference)
Whether the given reference has been processed.
|
boolean |
isQueued(String reference)
Whether the given reference is in the queue or not
(waiting to be processed).
|
boolean |
isQueueEmpty()
Whether there are any references to process in the queue.
|
protected boolean |
isStage(String reference,
IMongoSerializer.Stage stage) |
ICrawlData |
nextQueued()
Returns the next reference to be processed from the queue and marks it as
being "active" (i.e., currently being processed).
|
void |
processed(ICrawlData crawlData)
Marks this reference as processed.
|
void |
queue(ICrawlData crawlData)
Queues a reference for future processing.
|
public static final String DEFAULT_CACHED_COL_NAME
public static final String DEFAULT_REFERENCES_COL_NAME
public MongoCrawlDataStore(String crawlerId, boolean resume, MongoConnectionDetails conn, IMongoSerializer serializer)
crawlerId - crawler id
resume - whether to resume an aborted job
serializer - Mongo serializer
conn - Mongo connection details
public MongoCrawlDataStore(String crawlerId, boolean resume, MongoConnectionDetails conn, IMongoSerializer serializer, String referencesCollectionName, String cachedCollectionName)
crawlerId - crawler id
resume - whether to resume an aborted job
serializer - Mongo serializer
conn - Mongo connection details
referencesCollectionName - name of Mongo references collection
cachedCollectionName - name of Mongo cached collection
public MongoCrawlDataStore(boolean resume, com.mongodb.MongoClient client, String dbName, IMongoSerializer serializer)
resume - whether to resume an aborted job
client - Mongo client
dbName - Mongo database name
serializer - Mongo serializer
public MongoCrawlDataStore(boolean resume, com.mongodb.MongoClient client, String dbName, IMongoSerializer serializer, String referencesCollectionName, String cachedCollectionName)
resume - whether to resume an aborted job
client - Mongo client
dbName - Mongo database name
serializer - Mongo serializer
referencesCollectionName - name of Mongo references collection
cachedCollectionName - name of Mongo cached collection
public String getReferencesCollectionName()
public String getCachedCollectionName()
protected static com.mongodb.MongoClient buildMongoClient(String crawlerId, MongoConnectionDetails connDetails)
public void queue(ICrawlData crawlData)
ICrawlDataStore
Queues a reference for future processing.
crawlData - the reference to eventually be processed
public boolean isQueueEmpty()
ICrawlDataStore
true if the queue is empty
public int getQueueSize()
ICrawlDataStore
public boolean isQueued(String reference)
ICrawlDataStore
reference - the reference
true if the reference is in the queue
public ICrawlData nextQueued()
ICrawlDataStore
public boolean isActive(String reference)
ICrawlDataStore
reference - the reference
true if active
public int getActiveCount()
ICrawlDataStore
public ICrawlData getCached(String reference)
ICrawlDataStore
reference - reference cached from previous run
public boolean isCacheEmpty()
ICrawlDataStore
true if the cache is empty
public void processed(ICrawlData crawlData)
ICrawlDataStore
crawlData - processed reference
public boolean isProcessed(String reference)
ICrawlDataStore
reference - the reference
true if processed
public ICrawlData getProcessed(String reference)
ICrawlDataStore
reference - reference to get
public int getProcessedCount()
ICrawlDataStore
protected void deleteReferences(String... stages)
protected int getReferencesCount(IMongoSerializer.Stage stage)
protected boolean isStage(String reference, IMongoSerializer.Stage stage)
public void close()
ICrawlDataStore
public Iterator<ICrawlData> getCacheIterator()
ICrawlDataStore
Copyright © 2014–2021 Norconex Inc. All rights reserved.