metodo - Code Search

fess-crawler/src/main/java/org/codelibs/fess/crawler/util/CrawlingParameterUtil.java

import org.codelibs.fess.crawler.service.DataService;
import org.codelibs.fess.crawler.service.UrlQueueService;

/**
 * Utility class for managing crawling parameters using ThreadLocal variables.
 * This class provides methods to set and get various parameters related to the crawling process.
 *
 * <p>This class is final and cannot be instantiated.</p>
 *
 * <p>The following parameters are managed:</p>
 * <ul>

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sat Mar 15 06:52:00 UTC 2025

- 6.4K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/helper/impl/MimeTypeHelperImpl.java

/**
 * MimeTypeHelperImpl is a helper class that detects the MIME type of a given input stream or filename.
 * It uses the Apache Tika library to detect the MIME type.
 *
 * <p>
 * This class provides methods to:
 * </p>
 * <ul>
 *   <li>Detect the MIME type based on the input stream and filename.</li>
 *   <li>Normalize the filename to handle special characters.</li>

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 6.5K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/rule/impl/RegexRule.java

 * only one of them (allRequired = false). It also supports a default rule that always matches.
 *
 * <p>
 * The class uses a map of field names to Pattern objects to store the regular expressions.
 * The match method extracts the values of the specified fields from the ResponseData and
 * applies the corresponding regular expressions.
 * </p>
 *
 * <p>
 * Example usage:
 * </p>
 *
 * <pre>
 * {@code

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 6.2K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/CrawlerContext.java

/**
 * The {@link CrawlerContext} class holds the context information for a crawler execution.
 * It contains various attributes related to the crawler's state, configuration, and runtime data.
 * This class provides methods to access and modify these attributes, allowing for control and monitoring
 * of the crawler's behavior.
 *
 * <p>
 * The context includes information such as the session ID, active thread count, access count, crawler status,

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 8.9K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/service/impl/UrlQueueServiceImpl.java

import org.codelibs.fess.crawler.service.UrlQueueService;

import jakarta.annotation.Resource;

/**
 * Implementation of the {@link UrlQueueService} interface.
 * This class provides methods for managing a queue of URLs to be crawled,
 * including adding, deleting, and retrieving URLs from the queue.
 * It uses a {@link MemoryDataHelper} to store the URL queue data in memory.
 *
 * <p>

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 9.3K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/test/java/org/codelibs/fess/crawler/helper/impl/MimeTypeHelperImplTest.java

        assertContentType("application/zip", "extractor/zip/test.zip", "hoge.zip");
        assertContentType("application/x-lharc", "extractor/lha/test.lzh", "hoge.lzh"); // TODO is it correct?

        assertContentType("application/xml", "extractor/test.mm", "hoge.mm");

        assertContentType("message/rfc822", "extractor/eml/sample1.eml", "sample1.eml");

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sat Mar 15 06:52:00 UTC 2025

- 11.6K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/transformer/impl/HtmlTransformer.java

 *   <li>Extracting child URLs from the HTML content based on configured rules.</li>
 *   <li>Handling redirect URLs specified in the response headers.</li>
 * </ol>
 * <p>
 * The class also provides methods for configuring features and properties of the
 * underlying DOM parser, as well as defining rules for extracting child URLs
 * from specific HTML tags and attributes.
 * </p>
 *
 * <p>
 * <b>Configuration:</b>

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 28.5K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/entity/RobotsTxt.java

        if (directive == null) {
            return 0;
        }
        return directive.getCrawlDelay();
    }

    /**
     * Returns the most specific directive matching the given user agent.
     * The method finds the longest matching user agent pattern in the directives,
     * excluding the general "*" pattern which matches all bots.
     *
     * @param userAgent the user agent string to match against directives,

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 10K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/processor/impl/DefaultResponseProcessor.java

 * and {@link UrlQueue}. It also uses {@link CrawlingParameterUtil} to access services
 * like {@link UrlQueueService} and DataService, as well as the {@link CrawlerContext}.
 * </p>
 *
 * <p>
 * The class provides methods to check if a response is successful or not modified based on
 * configured HTTP status codes. It also handles the storage of child URLs found in the
 * response data, respecting the maximum depth and access count limits.
 * </p>

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Thu Aug 07 02:55:08 UTC 2025

- 12.5K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/JodExtractorTest.java

        String content = extractData.getContent();
        CloseableUtil.closeQuietly(in);
        logger.info(content);
        assertTrue(content.contains("テスト"));
    }

    /*
     * TODO not work... public void test_getText_mswordx_as() { InputStream in =
     * ResourceUtil.getResourceAsStream("extractor/msoffice/test_as.docx");
     * Map<String, String> params = new HashMap<String, String>();

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sat Mar 15 06:52:00 UTC 2025

- 9.5K bytes

- Viewed (0)

Search Options