crawling - Code Search

src/main/java/org/codelibs/fess/app/web/admin/crawlinginfo/EditForm.java

    /**
     * The unique identifier of the crawling information entry being edited.
     * This is a required field for identifying which crawling info to update.
     */
    @Required
    @Size(max = 1000)
    public String id;

    /**
     * The session identifier of the crawling session.
     * This is a required field that identifies the specific crawling session.
     * Maximum length is 20 characters.
     */

Created: Tue Mar 31 13:07:34 GMT 2026

- Last Modified: Thu Jul 17 08:28:31 GMT 2025

- 2.7K bytes

- Click Count (0)

github.com/codelibs/fess

src/main/java/org/codelibs/fess/exception/DataStoreCrawlingException.java

/**
 * Exception thrown when an error occurs during data store crawling operations.
 * This exception provides information about the URL where the error occurred
 * and whether the crawling process should be aborted.
 */
public class DataStoreCrawlingException extends CrawlingAccessException {

    private static final long serialVersionUID = 1L;

    /**
     * The URL where the crawling error occurred.
     */
    private final String url;

Created: Tue Mar 31 13:07:34 GMT 2026

- Last Modified: Wed Nov 19 08:04:23 GMT 2025

- 2.7K bytes

- Click Count (0)

github.com/codelibs/fess-crawler

README.md

        
        // Configure crawling parameters
        crawler.addUrl("https://example.com");
        crawler.crawlerContext.setMaxAccessCount(100);
        crawler.crawlerContext.setNumOfThread(5);
        crawler.urlFilter.addInclude("https://example.com/.*");
        
        // Execute crawling
        String sessionId = crawler.execute();
        System.out.println("Crawling completed. Session ID: " + sessionId);
    }
}

Created: Sun Apr 12 03:50:13 GMT 2026

- Last Modified: Sun Aug 31 05:32:52 GMT 2025

- 15.3K bytes

- Click Count (0)

github.com/codelibs/fess

src/main/java/org/codelibs/fess/exec/Crawler.java

 * <ul>
 * <li>Web crawling - crawls web sites and web content</li>
 * <li>File system crawling - crawls file systems and documents</li>
 * <li>Data store crawling - crawls databases and other data sources</li>
 * <li>Combined crawling - runs multiple crawling types simultaneously</li>
 * </ul>
 *
 * <p>Command line usage:
 * <pre>
 * java org.codelibs.fess.exec.Crawler [options...]

Created: Tue Mar 31 13:07:34 GMT 2026

- Last Modified: Thu Mar 26 02:24:08 GMT 2026

- 32.4K bytes

- Click Count (0)

github.com/codelibs/fess

src/main/java/org/codelibs/fess/helper/CrawlingConfigHelper.java

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

import jakarta.annotation.PostConstruct;

/**
 * Helper class for managing crawling configurations.
 * Provides functionality to store, retrieve, and manage different types of crawling configurations
 * including web, file, and data configurations. Supports caching and session-based configuration management.
 */
public class CrawlingConfigHelper {

Created: Tue Mar 31 13:07:34 GMT 2026

- Last Modified: Fri Nov 28 16:29:12 GMT 2025

- 19.5K bytes

- Click Count (1)

github.com/codelibs/fess

src/main/java/org/codelibs/fess/helper/CrawlingInfoHelper.java

    }

    /**
     * Stores crawling information and parameters for the specified session.
     * Creates a new crawling info record if none exists or if create flag is true.
     * Also stores any accumulated information parameters and clears the info map.
     *
     * @param sessionId the session ID for the crawling information
     * @param create if true, creates a new crawling info regardless of existing records

Created: Tue Mar 31 13:07:34 GMT 2026

- Last Modified: Thu Aug 07 03:06:29 GMT 2025

- 15.2K bytes

- Click Count (0)

github.com/codelibs/fess

src/main/java/org/codelibs/fess/helper/WebFsIndexHelper.java

            if (logger.isInfoEnabled()) {
                logger.info("No crawling target urls.");
            }
            return;
        }

        doCrawl(sessionId, webConfigList, fileConfigList);
    }

    /**
     * Performs the actual crawling operation for the provided configurations.
     *
     * @param sessionId The session ID for this crawling operation
     * @param webConfigList List of web configurations to crawl

Created: Tue Mar 31 13:07:34 GMT 2026

- Last Modified: Fri Nov 28 16:29:12 GMT 2025

- 25K bytes

- Click Count (0)

github.com/codelibs/fess

src/main/java/org/codelibs/fess/app/service/CrawlingInfoService.java

    }

    /**
     * Stores (inserts or updates) a crawling information record.
     * Sets up the store conditions including creation time if not already set,
     * then performs an insert or update operation with immediate refresh.
     *
     * @param crawlingInfo the crawling information entity to store
     * @throws FessSystemException if the crawling information is null
     */

Created: Tue Mar 31 13:07:34 GMT 2026

- Last Modified: Thu Jul 17 08:28:31 GMT 2025

- 19.9K bytes

- Click Count (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/Crawler.java

import org.codelibs.fess.crawler.service.UrlQueueService;

import jakarta.annotation.Resource;

/**
 * The Crawler class is the main class for web crawling. It manages the crawling process,
 * including adding URLs to the queue, filtering URLs, managing crawler threads,
 * and handling the overall crawling lifecycle.
 *
 * <p>It implements the Runnable interface to be executed in a separate thread,

Created: Sun Apr 12 03:50:13 GMT 2026

- Last Modified: Mon Nov 24 03:59:47 GMT 2025

- 17K bytes

- Click Count (0)

github.com/codelibs/fess

src/main/java/org/codelibs/fess/helper/DataIndexHelper.java

/**
 * Helper class for managing data crawling operations in Fess.
 * This class coordinates the execution of data store crawling processes,
 * managing multiple concurrent crawling threads and handling the indexing
 * of crawled documents into the search engine.
 *
 * <p>The DataIndexHelper supports:</p>
 * <ul>
 *   <li>Concurrent crawling of multiple data configurations</li>

Created: Tue Mar 31 13:07:34 GMT 2026

- Last Modified: Fri Nov 28 16:29:12 GMT 2025

- 19K bytes

- Click Count (0)

Search Options