Search Options

Results per page
Sort
Preferred Languages
Advanced

Results 1 - 10 of 130 for extraction (0.05 sec)

  1. fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/PdfExtractor.java

     * document and includes it in the extraction result.
     *
     * <p>Features:
     * <ul>
     *   <li>Text extraction from PDF pages</li>
     *   <li>Embedded document extraction</li>
     *   <li>Annotation extraction (file attachments)</li>
     *   <li>Metadata extraction</li>
     *   <li>Password-protected PDF support</li>
     *   <li>Configurable timeout for extraction process</li>
     * </ul>
     *
     * @author shinsuke
     */
    Registered: Sat Dec 20 11:21:39 UTC 2025
    - Last Modified: Sun Nov 23 12:19:14 UTC 2025
    - 12.8K bytes
    - Viewed (0)
  2. fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/ExtractorBuilder.java

     * It encapsulates the process of extracting data from an input stream using a specified or detected extractor.
     * The builder allows setting parameters such as MIME type, filename, extractor name, maximum content length,
     * and cache file size to optimize the extraction process.
     *
     * <p>
     * The main purpose of this class is to simplify the extraction process by providing a fluent interface
    Registered: Sat Dec 20 11:21:39 UTC 2025
    - Last Modified: Sun Jul 06 02:13:03 UTC 2025
    - 10.1K bytes
    - Viewed (0)
  3. fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/TextExtractorEnhancedTest.java

                assertTrue("Error message should indicate extraction failure", e.getMessage().contains("Failed to extract"));
            } finally {
                // Reset to default encoding
                textExtractor.setEncoding("UTF-8");
            }
        }
    
        /**
         * Test extraction with empty input stream.
         */
        public void test_getText_emptyInputStream_returnsEmptyContent() {
    Registered: Sat Dec 20 11:21:39 UTC 2025
    - Last Modified: Mon Nov 24 03:59:47 UTC 2025
    - 8.9K bytes
    - Viewed (0)
  4. fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/FilenameExtractorEnhancedTest.java

        }
    
        /**
         * Test extraction with null parameters map.
         */
        public void test_getText_withNullParams() {
            final InputStream in = new ByteArrayInputStream(new byte[0]);
    
            final ExtractData result = filenameExtractor.getText(in, null);
    
            assertNotNull(result);
            assertEquals("", result.getContent());
        }
    
        /**
         * Test extraction with empty parameters map.
         */
    Registered: Sat Dec 20 11:21:39 UTC 2025
    - Last Modified: Mon Nov 24 03:59:47 UTC 2025
    - 7K bytes
    - Viewed (0)
  5. fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/JsonExtractor.java

     * Extracts text content and metadata from JSON files.
     * This extractor provides better structured data extraction compared to Tika's generic text extraction.
     *
     * <p>Features:
     * <ul>
     *   <li>Structured text extraction with key-value pairs</li>
     *   <li>Top-level field extraction as metadata</li>
     *   <li>Nested structure flattening with configurable depth</li>
     *   <li>Array element extraction</li>
    Registered: Sat Dec 20 11:21:39 UTC 2025
    - Last Modified: Sun Nov 23 03:46:53 UTC 2025
    - 9.7K bytes
    - Viewed (0)
  6. fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MarkdownExtractor.java

    /**
     * Extracts text content and metadata from Markdown files.
     * This extractor provides better structured data extraction compared to Tika's generic text extraction.
     *
     * <p>Features:
     * <ul>
     *   <li>YAML front matter metadata extraction</li>
     *   <li>Heading structure extraction</li>
     *   <li>Link URL extraction</li>
     *   <li>Code block content extraction</li>
     *   <li>Clean text conversion from Markdown</li>
    Registered: Sat Dec 20 11:21:39 UTC 2025
    - Last Modified: Sun Nov 23 03:46:53 UTC 2025
    - 8.2K bytes
    - Viewed (0)
  7. fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractXmlExtractor.java

        /**
         * Default character encoding for content extraction.
         */
        protected String encoding = Constants.UTF_8;
    
        /**
         * The preload size for charset detection.
         */
        protected int preloadSizeForCharset = 2048;
    
        /**
         * Indicates whether comment tags should be ignored during extraction.
         */
        protected boolean ignoreCommentTag = false;
    
        /**
    Registered: Sat Dec 20 11:21:39 UTC 2025
    - Last Modified: Sun Nov 23 12:19:14 UTC 2025
    - 8.6K bytes
    - Viewed (0)
  8. fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ExtractorResourceManagementTest.java

         */
        public void test_MsWordExtractor_closesResourcesOnSuccess() throws IOException {
            final MsWordExtractor extractor = container.getComponent("msWordExtractor");
            final AtomicBoolean streamClosed = new AtomicBoolean(false);
    
            try (final InputStream originalStream = ResourceUtil.getResourceAsStream("extractor/msoffice/test.doc")) {
    Registered: Sat Dec 20 11:21:39 UTC 2025
    - Last Modified: Mon Nov 24 03:59:47 UTC 2025
    - 10.4K bytes
    - Viewed (0)
  9. fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java

    import org.codelibs.fess.crawler.extractor.Extractor;
    import org.codelibs.fess.crawler.extractor.ExtractorFactory;
    import org.codelibs.fess.crawler.helper.MimeTypeHelper;
    import org.codelibs.fess.crawler.util.IgnoreCloseInputStream;
    
    import jp.gr.java_conf.dangan.util.lha.LhaFile;
    import jp.gr.java_conf.dangan.util.lha.LhaHeader;
    
    /**
     * Extractor implementation for LHA (LZH) archive files.
    Registered: Sat Dec 20 11:21:39 UTC 2025
    - Last Modified: Sun Nov 23 12:19:14 UTC 2025
    - 5.9K bytes
    - Viewed (0)
  10. fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/Extractor.java

     */
    package org.codelibs.fess.crawler.extractor;
    
    import java.io.InputStream;
    import java.util.Map;
    
    import org.codelibs.fess.crawler.entity.ExtractData;
    
    /**
     * The Extractor interface defines methods for extracting text data from an input stream.
     * Implementations of this interface should provide the logic for extracting text and
     * optionally override the default weight value.
     */
    Registered: Sat Dec 20 11:21:39 UTC 2025
    - Last Modified: Sat Mar 15 06:52:00 UTC 2025
    - 1.6K bytes
    - Viewed (0)
Back to top