Search Options

Results per page
Sort
Preferred Languages
Advanced

Results 1 - 9 of 9 for crawl (0.01 sec)

  1. fess-crawler/src/main/java/org/codelibs/fess/crawler/entity/RobotsTxt.java

        }
    
        /**
         * Gets the crawl delay value for the specified user agent from robots.txt.
         * The crawl delay specifies the time (in seconds) to wait between successive requests.
         *
         * @param userAgent The user agent string to match against robots.txt directives
         * @return The crawl delay value in seconds. Returns 0 if no matching directive is found
         *         or no crawl delay is specified for the matching directive.
    Registered: Sun Sep 21 03:50:09 UTC 2025
    - Last Modified: Sun Jul 06 02:13:03 UTC 2025
    - 10K bytes
    - Viewed (0)
  2. fess-crawler/src/test/resources/org/codelibs/fess/crawler/helper/robots.txt

    User-agent: FessCrawler
    Disallow:           # allows all 
    
    User-agent: BruteBot
    Disallow: /
    Allow: /foo/bar/
    Crawl-delay: 1314000
    
    # welcome!
    User-agent: Googlebot
    Crawl-delay: 1
    
    User-agent: *
    Disallow: /private/
    Disallow: /help        # disallows /help.html, /help/index.html, etc.
    Allow: /help/faq.html
    Crawl-delay: 3
    
    User-agent: Crawler
    Disallow: /aaa
    
    User-agent: Crawler/1.0
    Disallow: /bbb
    
    Registered: Sun Sep 21 03:50:09 UTC 2025
    - Last Modified: Sun Oct 11 02:16:55 UTC 2015
    - 566 bytes
    - Viewed (0)
  3. fess-crawler/src/main/java/org/codelibs/fess/crawler/helper/RobotsTxtHelper.java

    import org.codelibs.fess.crawler.exception.RobotsTxtException;
    
    /**
     * Robots.txt Specifications:
     * <ul>
     * <li><a href=
     * "https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt"
     * >https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
     * </a></li>
     * </ul>
     *
     * @author bowez
     * @author shinsuke
     *
     */
    public class RobotsTxtHelper {
    
    Registered: Sun Sep 21 03:50:09 UTC 2025
    - Last Modified: Sun Jul 06 02:13:03 UTC 2025
    - 7.7K bytes
    - Viewed (0)
  4. README.md

    </components>
    ```
    
    ### Crawler Context Configuration
    
    ```java
    // Set maximum number of URLs to crawl
    crawler.crawlerContext.setMaxAccessCount(1000);
    
    // Set number of crawler threads
    crawler.crawlerContext.setNumOfThread(10);
    
    // Set maximum crawl depth
    crawler.crawlerContext.setMaxDepth(3);
    
    // Set request interval (politeness)
    Registered: Sun Sep 21 03:50:09 UTC 2025
    - Last Modified: Sun Aug 31 05:32:52 UTC 2025
    - 15.3K bytes
    - Viewed (0)
  5. fess-crawler/src/main/java/org/codelibs/fess/crawler/entity/SitemapUrl.java

         * command. Even though search engine crawlers may consider this information
         * when making decisions, they may crawl pages marked "hourly" less
         * frequently than that, and they may crawl pages marked "yearly" more
         * frequently than that. Crawlers may periodically crawl pages marked
         * "never" so that they can handle unexpected changes to those pages.
         */
        private String changefreq;
    
        /**
    Registered: Sun Sep 21 03:50:09 UTC 2025
    - Last Modified: Sun Jul 06 02:13:03 UTC 2025
    - 6.5K bytes
    - Viewed (0)
  6. fess-crawler/src/main/java/org/codelibs/fess/crawler/entity/ResultData.java

    import java.util.LinkedHashSet;
    import java.util.Set;
    import java.util.function.Function;
    
    import org.codelibs.fess.crawler.exception.CrawlerSystemException;
    
    /**
     * This class represents the result data of a crawl.
     */
    public class ResultData implements Serializable {
        private static final long serialVersionUID = 1L;
    
        /** The name of the transformer. */
        protected String transformerName;
    
    Registered: Sun Sep 21 03:50:09 UTC 2025
    - Last Modified: Sun Jul 06 02:13:03 UTC 2025
    - 4.7K bytes
    - Viewed (0)
  7. fess-crawler-opensearch/src/main/java/org/codelibs/fess/crawler/entity/OpenSearchAccessResult.java

         */
        private boolean initializedData = false;
    
        /**
         * Initializes the access result with response data and result data.
         *
         * @param responseData The response data from the crawl operation.
         * @param resultData The result data from content processing.
         */
        @Override
        public void init(final ResponseData responseData, final ResultData resultData) {
    
    Registered: Sun Sep 21 03:50:09 UTC 2025
    - Last Modified: Sun Jul 06 02:13:03 UTC 2025
    - 6.5K bytes
    - Viewed (0)
  8. fess-crawler/src/main/java/org/codelibs/fess/crawler/client/fs/FileSystemClient.java

    import org.codelibs.fess.crawler.helper.ContentLengthHelper;
    import org.codelibs.fess.crawler.helper.MimeTypeHelper;
    
    import jakarta.annotation.Resource;
    
    /**
     * FileSystemClient is a CrawlerClient implementation to crawl files on a file
     * system.
     *
     * @author shinsuke
     *
     */
    public class FileSystemClient extends AbstractCrawlerClient {
    
        /** Logger instance for this class */
    Registered: Sun Sep 21 03:50:09 UTC 2025
    - Last Modified: Sun Jul 06 02:13:03 UTC 2025
    - 13.8K bytes
    - Viewed (0)
  9. fess-crawler/src/main/java/org/codelibs/fess/crawler/CrawlerThread.java

         */
        protected CrawlerClient getClient(final String url) {
            return clientFactory.getClient(url);
        }
    
        /**
         * Checks if the content has been updated since the last crawl.
         * @param client The crawler client.
         * @param urlQueue The URL queue entry.
         * @return true if content is updated, false otherwise.
         */
    Registered: Sun Sep 21 03:50:09 UTC 2025
    - Last Modified: Thu Aug 07 02:55:08 UTC 2025
    - 20.4K bytes
    - Viewed (0)
Back to top