public class HttpCrawlerConfig extends AbstractCrawlerConfig
ICrawlerConfig.OrphansStrategy
Constructor and Description |
---|
HttpCrawlerConfig() |
Modifier and Type | Method and Description |
---|---|
boolean |
equals(Object other) |
ICanonicalLinkDetector |
getCanonicalLinkDetector()
Gets the canonical link detector.
|
IDelayResolver |
getDelayResolver() |
IHttpDocumentFetcher |
getDocumentFetcher() |
IHttpClientFactory |
getHttpClientFactory() |
ILinkExtractor[] |
getLinkExtractors() |
int |
getMaxDepth() |
IMetadataChecksummer |
getMetadataChecksummer()
Gets the metadata checksummer.
|
IHttpMetadataFetcher |
getMetadataFetcher() |
IHttpDocumentProcessor[] |
getPostImportProcessors() |
IHttpDocumentProcessor[] |
getPreImportProcessors() |
IRecrawlableResolver |
getRecrawlableResolver()
Gets the recrawlable resolver.
|
IRedirectURLProvider |
getRedirectURLProvider()
Gets the redirect URL provider.
|
IRobotsMetaProvider |
getRobotsMetaProvider() |
IRobotsTxtProvider |
getRobotsTxtProvider() |
ISitemapResolverFactory |
getSitemapResolverFactory() |
String[] |
getStartSitemapURLs()
Gets sitemap URLs to be used as starting points for crawling.
|
String[] |
getStartURLs() |
String[] |
getStartURLsFiles()
Gets the file paths of seed files containing URLs to be used as
"start URLs".
|
IStartURLsProvider[] |
getStartURLsProviders()
Gets the providers of URLs used as starting points for crawling.
|
URLCrawlScopeStrategy |
getURLCrawlScopeStrategy()
Gets the strategy to use to determine if a URL is in scope.
|
IURLNormalizer |
getUrlNormalizer() |
String |
getUserAgent() |
int |
hashCode() |
boolean |
isIgnoreCanonicalLinks()
Whether canonical links found in HTTP headers and in HTML files
<head> section should be ignored or processed.
|
boolean |
isIgnoreRobotsMeta() |
boolean |
isIgnoreRobotsTxt() |
boolean |
isIgnoreSitemap()
Whether to ignore sitemap detection and resolving for URLs processed.
|
boolean |
isKeepDownloads() |
boolean |
isKeepMaxDepthLinks()
Gets whether to keep (and extract) links on pages having reached
the configured maximum depth.
|
boolean |
isKeepOutOfScopeLinks()
Whether links not in scope should be stored as metadata
under
HttpMetadata.COLLECTOR_REFERENCED_URLS_OUT_OF_SCOPE |
boolean |
isSkipMetaFetcherOnBadStatus()
Gets whether to skip metadata fetching activities instead of
rejecting a document on bad status.
|
protected void |
loadCrawlerConfigFromXML(XMLConfiguration xml) |
protected void |
saveCrawlerConfigToXML(Writer out) |
void |
setCanonicalLinkDetector(ICanonicalLinkDetector canonicalLinkDetector)
Sets the canonical link detector.
|
void |
setDelayResolver(IDelayResolver delayResolver) |
void |
setDocumentFetcher(IHttpDocumentFetcher documentFetcher) |
void |
setHttpClientFactory(IHttpClientFactory httpClientFactory) |
void |
setIgnoreCanonicalLinks(boolean ignoreCanonicalLinks)
Sets whether canonical links found in HTTP headers and in HTML files
<head> section should be ignored or processed.
|
void |
setIgnoreRobotsMeta(boolean ignoreRobotsMeta) |
void |
setIgnoreRobotsTxt(boolean ignoreRobotsTxt) |
void |
setIgnoreSitemap(boolean ignoreSitemap)
Sets whether to ignore sitemap detection and resolving for URLs
processed.
|
void |
setKeepDownloads(boolean keepDownloads) |
void |
setKeepMaxDepthLinks(boolean keepMaxDepthLinks)
Sets whether to keep (and extract) links on pages having reached
the configured maximum depth.
|
void |
setKeepOutOfScopeLinks(boolean keepOutOfScopeLinks)
Sets whether links not in scope should be stored as metadata
under
HttpMetadata.COLLECTOR_REFERENCED_URLS_OUT_OF_SCOPE |
void |
setLinkExtractors(ILinkExtractor... linkExtractors) |
void |
setMaxDepth(int depth) |
void |
setMetadataChecksummer(IMetadataChecksummer metadataChecksummer) |
void |
setMetadataFetcher(IHttpMetadataFetcher metadataFetcher) |
void |
setPostImportProcessors(IHttpDocumentProcessor... httpPostProcessors) |
void |
setPreImportProcessors(IHttpDocumentProcessor... httpPreProcessors) |
void |
setRecrawlableResolver(IRecrawlableResolver recrawlableResolver)
Sets the recrawlable resolver.
|
void |
setRedirectURLProvider(IRedirectURLProvider redirectURLProvider)
Sets the redirect URL provider.
|
void |
setRobotsMetaProvider(IRobotsMetaProvider robotsMetaProvider) |
void |
setRobotsTxtProvider(IRobotsTxtProvider robotsTxtProvider) |
void |
setSitemapResolverFactory(ISitemapResolverFactory sitemapResolverFactory) |
void |
setSkipMetaFetcherOnBadStatus(boolean skipMetaFetcherOnBadStatus)
Sets whether to skip metadata fetching activities instead of
rejecting a document on bad status.
|
void |
setStartSitemapURLs(String... startSitemapURLs)
Sets the sitemap URLs used as starting points for crawling.
|
void |
setStartURLs(String... startURLs) |
void |
setStartURLsFiles(String... startURLsFiles)
Sets the file paths of seed files containing URLs to be used as
"start URLs".
|
void |
setStartURLsProviders(IStartURLsProvider... startURLsProviders)
Sets the providers of URLs used as starting points for crawling.
|
void |
setUrlCrawlScopeStrategy(URLCrawlScopeStrategy urlCrawlScopeStrategy)
Sets the strategy to use to determine if a URL is in scope.
|
void |
setUrlNormalizer(IURLNormalizer urlNormalizer) |
void |
setUserAgent(String userAgent) |
String |
toString() |
defaultIfEmpty, getCommitter, getCrawlDataStoreFactory, getCrawlerListeners, getDocumentChecksummer, getDocumentFilters, getId, getImporterConfig, getMaxDocuments, getMetadataFilters, getNumThreads, getOrphansStrategy, getReferenceFilters, getSpoiledReferenceStrategizer, getStopOnExceptions, getWorkDir, loadFromXML, saveToXML, setCommitter, setCrawlDataStoreFactory, setCrawlerListeners, setDocumentChecksummer, setDocumentFilters, setId, setImporterConfig, setMaxDocuments, setMetadataFilters, setNumThreads, setOrphansStrategy, setReferenceFilters, setSpoiledReferenceStrategizer, setStopOnExceptions, setWorkDir, writeArray, writeObject, writeObject
public String[] getStartURLs()
public void setStartURLs(String... startURLs)
public String[] getStartURLsFiles()
public void setStartURLsFiles(String... startURLsFiles)
startURLsFiles - file paths of seed files containing URLs
public String[] getStartSitemapURLs()
public void setStartSitemapURLs(String... startSitemapURLs)
startSitemapURLs - sitemap URLs
public IStartURLsProvider[] getStartURLsProviders()
public void setStartURLsProviders(IStartURLsProvider... startURLsProviders)
startURLsProviders - start URL provider
public void setMaxDepth(int depth)
public int getMaxDepth()
public IHttpClientFactory getHttpClientFactory()
public void setHttpClientFactory(IHttpClientFactory httpClientFactory)
public IHttpDocumentFetcher getDocumentFetcher()
public void setDocumentFetcher(IHttpDocumentFetcher documentFetcher)
public IHttpMetadataFetcher getMetadataFetcher()
public void setMetadataFetcher(IHttpMetadataFetcher metadataFetcher)
public ICanonicalLinkDetector getCanonicalLinkDetector()
null if none are defined.
public void setCanonicalLinkDetector(ICanonicalLinkDetector canonicalLinkDetector)
null argument, or invoke setIgnoreCanonicalLinks(boolean) with a true value.
canonicalLinkDetector - the canonical link detector
public ILinkExtractor[] getLinkExtractors()
public void setLinkExtractors(ILinkExtractor... linkExtractors)
public IRobotsTxtProvider getRobotsTxtProvider()
public void setRobotsTxtProvider(IRobotsTxtProvider robotsTxtProvider)
public IURLNormalizer getUrlNormalizer()
public void setUrlNormalizer(IURLNormalizer urlNormalizer)
public IDelayResolver getDelayResolver()
public void setDelayResolver(IDelayResolver delayResolver)
public IHttpDocumentProcessor[] getPreImportProcessors()
public void setPreImportProcessors(IHttpDocumentProcessor... httpPreProcessors)
public IHttpDocumentProcessor[] getPostImportProcessors()
public void setPostImportProcessors(IHttpDocumentProcessor... httpPostProcessors)
public boolean isIgnoreRobotsTxt()
public void setIgnoreRobotsTxt(boolean ignoreRobotsTxt)
public boolean isKeepDownloads()
public void setKeepDownloads(boolean keepDownloads)
public boolean isKeepMaxDepthLinks()
true if keeping max depth links.
getMaxDepth()
public void setKeepMaxDepthLinks(boolean keepMaxDepthLinks)
keepMaxDepthLinks - true to keep max depth links.
setMaxDepth(int)
public boolean isKeepOutOfScopeLinks()
HttpMetadata.COLLECTOR_REFERENCED_URLS_OUT_OF_SCOPE
true if keeping URLs not in scope.
public void setKeepOutOfScopeLinks(boolean keepOutOfScopeLinks)
HttpMetadata.COLLECTOR_REFERENCED_URLS_OUT_OF_SCOPE
keepOutOfScopeLinks - true if keeping URLs not in scope
public IMetadataChecksummer getMetadataChecksummer()
LastModifiedMetadataChecksummer (since 2.2.0).
public void setMetadataChecksummer(IMetadataChecksummer metadataChecksummer)
public boolean isIgnoreRobotsMeta()
public void setIgnoreRobotsMeta(boolean ignoreRobotsMeta)
public IRobotsMetaProvider getRobotsMetaProvider()
public void setRobotsMetaProvider(IRobotsMetaProvider robotsMetaProvider)
public boolean isIgnoreSitemap()
Sitemaps specified as start URLs (getStartSitemapURLs()) are never ignored.
true to ignore sitemaps
public void setIgnoreSitemap(boolean ignoreSitemap)
Sitemaps specified as start URLs (getStartSitemapURLs()) are never ignored.
ignoreSitemap - true to ignore sitemaps
public ISitemapResolverFactory getSitemapResolverFactory()
public void setSitemapResolverFactory(ISitemapResolverFactory sitemapResolverFactory)
public String getUserAgent()
public void setUserAgent(String userAgent)
public boolean isIgnoreCanonicalLinks()
true if ignoring canonical links
public void setIgnoreCanonicalLinks(boolean ignoreCanonicalLinks)
true
URL pages with a canonical URL pointer in them are not
processed.
ignoreCanonicalLinks - true if ignoring canonical links
public URLCrawlScopeStrategy getURLCrawlScopeStrategy()
public void setUrlCrawlScopeStrategy(URLCrawlScopeStrategy urlCrawlScopeStrategy)
urlCrawlScopeStrategy - strategy to use
public IRedirectURLProvider getRedirectURLProvider()
public void setRedirectURLProvider(IRedirectURLProvider redirectURLProvider)
redirectURLProvider - redirect URL provider
public IRecrawlableResolver getRecrawlableResolver()
public void setRecrawlableResolver(IRecrawlableResolver recrawlableResolver)
recrawlableResolver - the recrawlable resolver
public boolean isSkipMetaFetcherOnBadStatus()
true if skipping
public void setSkipMetaFetcherOnBadStatus(boolean skipMetaFetcherOnBadStatus)
true
, upon
receiving a bad HTTP status code, activities such as metadata filtering,
canonical URL resolution and metadata checksum creation are all skipped.
When applicable, those activities will be performed after the document
fetcher also had a chance to download metadata. Setting this flag to
true
can be useful when the HTTP HEAD method is not
supported by some sites or pages.
skipMetaFetcherOnBadStatus - true if skipping
protected void saveCrawlerConfigToXML(Writer out) throws IOException
saveCrawlerConfigToXML
in class AbstractCrawlerConfig
IOException
protected void loadCrawlerConfigFromXML(XMLConfiguration xml)
loadCrawlerConfigFromXML
in class AbstractCrawlerConfig
public boolean equals(Object other)
equals
in class AbstractCrawlerConfig
public int hashCode()
hashCode
in class AbstractCrawlerConfig
public String toString()
toString
in class AbstractCrawlerConfig
Copyright © 2009–2021 Norconex Inc. All rights reserved.