Search Options

Results per page
Sort
Preferred Languages
Advanced

Results 1 - 10 of 50 for extractors (0.06 sec)

  1. README.md

    ### Multi-Module Structure
    
    ```
    fess-crawler-parent/
    ├── fess-crawler/              # Core crawler framework
    │   ├── client/               # Protocol clients (HTTP, FTP, SMB, etc.)
    │   ├── extractor/           # Content extractors
    │   ├── transformer/         # Data transformers
    │   └── service/             # Core services
    ├── fess-crawler-lasta/       # LastaFlute DI integration
    └── fess-crawler-opensearch/  # OpenSearch backend
    ```
    Registered: Sun Sep 21 03:50:09 UTC 2025
    - Last Modified: Sun Aug 31 05:32:52 UTC 2025
    - 15.3K bytes
    - Viewed (0)
  2. fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/JodExtractor.java

            final Extractor extractor = getExtractor(outExt);
            if (extractor != null) {
                final Map<String, String> params = new HashMap<>();
                params.put(ExtractData.RESOURCE_NAME_KEY, outputFile.getName());
                try (final FileInputStream in = new FileInputStream(outputFile)) {
                    final ExtractData extractData = extractor.getText(in, params);
    Registered: Sun Sep 21 03:50:09 UTC 2025
    - Last Modified: Sun Jul 06 02:13:03 UTC 2025
    - 10.3K bytes
    - Viewed (0)
  3. fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/PdfExtractor.java

    import org.codelibs.fess.crawler.extractor.Extractor;
    import org.codelibs.fess.crawler.extractor.ExtractorFactory;
    import org.codelibs.fess.crawler.helper.MimeTypeHelper;
    
    /**
     * PdfExtractor extracts text content from PDF files using Apache PDFBox.
     * It supports password-protected PDFs and can extract embedded documents and annotations.
     *
    Registered: Sun Sep 21 03:50:09 UTC 2025
    - Last Modified: Sun Jul 06 02:13:03 UTC 2025
    - 12.7K bytes
    - Viewed (0)
  4. fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java

    import org.codelibs.fess.crawler.Constants;
    import org.codelibs.fess.crawler.entity.ExtractData;
    import org.codelibs.fess.crawler.exception.ExtractException;
    import org.codelibs.fess.crawler.extractor.Extractor;
    import org.codelibs.fess.crawler.extractor.ExtractorFactory;
    import org.codelibs.fess.crawler.helper.MimeTypeHelper;
    
    import jakarta.mail.Address;
    import jakarta.mail.BodyPart;
    import jakarta.mail.Header;
    import jakarta.mail.Message;
    Registered: Sun Sep 21 03:50:09 UTC 2025
    - Last Modified: Sun Jul 06 02:13:03 UTC 2025
    - 12.6K bytes
    - Viewed (0)
  5. fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/ExtractorBuilder.java

                dfos = out;
                CopyUtil.copy(in, out);
                out.flush();
    
                Extractor extractor = StringUtil.isBlank(mimeType) ? null : extractorFactory.getExtractor(mimeType);
                if (extractor == null) {
                    final String detectedMimeType = getMimeType(out);
                    extractor = extractorFactory.getExtractor(detectedMimeType);
    Registered: Sun Sep 21 03:50:09 UTC 2025
    - Last Modified: Sun Jul 06 02:13:03 UTC 2025
    - 10.1K bytes
    - Viewed (0)
  6. fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ApiExtractor.java

     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
     * either express or implied. See the License for the specific language
     * governing permissions and limitations under the License.
     */
    package org.codelibs.fess.crawler.extractor.impl;
    
    import java.io.IOException;
    import java.io.InputStream;
    import java.nio.charset.Charset;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    
    Registered: Sun Sep 21 03:50:09 UTC 2025
    - Last Modified: Thu Aug 07 02:55:08 UTC 2025
    - 12.2K bytes
    - Viewed (0)
  7. fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/CommandExtractor.java

        /**
         * Constructs a new CommandExtractor.
         */
        public CommandExtractor() {
            // NOP
        }
    
        /*
         * (non-Javadoc)
         *
         * @see org.codelibs.fess.crawler.extractor.Extractor#getText(java.io.InputStream,
         * java.util.Map)
         */
        @Override
        public ExtractData getText(final InputStream in, final Map<String, String> params) {
    Registered: Sun Sep 21 03:50:09 UTC 2025
    - Last Modified: Sun Jul 06 02:13:03 UTC 2025
    - 16K bytes
    - Viewed (0)
  8. fess-crawler/src/test/java/org/codelibs/fess/crawler/helper/impl/MimeTypeHelperImplTest.java

            assertContentType("application/pdf", "extractor/test.pdf", "hoge.pdf");
    
            assertContentType("application/gzip", "extractor/gz/test.tar.gz", "hoge.tar.gz");
            assertContentType("application/zip", "extractor/zip/test.zip", "hoge.zip");
            assertContentType("application/x-lharc", "extractor/lha/test.lzh", "hoge.lzh"); // TODO is it correct?
    
            assertContentType("application/xml", "extractor/test.mm", "hoge.mm");
    
    Registered: Sun Sep 21 03:50:09 UTC 2025
    - Last Modified: Sat Mar 15 06:52:00 UTC 2025
    - 11.6K bytes
    - Viewed (0)
  9. fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/TikaExtractorTest.java

        // InputStream in =
        // ResourceUtil.getResourceAsStream("extractor/test_pass.pdf");
        // TikaExtractor extractor =
        // (TikaExtractor) getContainer().getComponent(
        // "tikaExtractorForPdfPassword");
        // Map<String, String> params = new HashMap<String, String>();
        // params.put(ExtractData.URL, "http://example.com/test_pass.pdf");
        // ExtractData extractData = extractor.getText(in, params);
    Registered: Sun Sep 21 03:50:09 UTC 2025
    - Last Modified: Thu Aug 07 02:55:08 UTC 2025
    - 30.6K bytes
    - Viewed (0)
  10. fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/HtmlXpathExtractor.java

    import com.google.common.cache.CacheLoader;
    import com.google.common.cache.LoadingCache;
    
    import jakarta.annotation.Resource;
    
    /**
     * {@link HtmlXpathExtractor} is an implementation of the {@link org.codelibs.fess.crawler.extractor.Extractor} interface.
     * It uses XPath expressions to extract text content from HTML documents.
     * <p>
     * This class provides methods to configure the XPath expressions, parser features, and properties.
    Registered: Sun Sep 21 03:50:09 UTC 2025
    - Last Modified: Sun Jul 06 02:13:03 UTC 2025
    - 10.3K bytes
    - Viewed (0)
Back to top