- Sort Score
- Result 10 results
- Languages All
Results 1 - 10 of 73 for extracted (0.11 sec)
-
fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/PdfExtractor.java
import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.MimeTypeHelper; /** * PdfExtractor extracts text content from PDF files using Apache PDFBox. * It supports password-protected PDFs and can extract embedded documents and annotations. *
Registered: Sat Dec 20 11:21:39 UTC 2025 - Last Modified: Sun Nov 23 12:19:14 UTC 2025 - 12.8K bytes - Viewed (0) -
fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java
import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.MimeTypeHelper; import org.codelibs.fess.crawler.util.IgnoreCloseInputStream; import jp.gr.java_conf.dangan.util.lha.LhaFile; import jp.gr.java_conf.dangan.util.lha.LhaHeader; /** * Extractor implementation for LHA (LZH) archive files.
Registered: Sat Dec 20 11:21:39 UTC 2025 - Last Modified: Sun Nov 23 12:19:14 UTC 2025 - 5.9K bytes - Viewed (0) -
fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/JodExtractor.java
* @param outExt the output file extension * @return the extracted text content * @throws ExtractException if an error occurs while reading the file */ protected String getOutputContent(final File outputFile, final String outExt) { final Extractor extractor = getExtractor(outExt); if (extractor != null) { final Map<String, String> params = new HashMap<>();Registered: Sat Dec 20 11:21:39 UTC 2025 - Last Modified: Sun Nov 23 12:19:14 UTC 2025 - 10.4K bytes - Viewed (0) -
fess-crawler/src/test/resources/extractor/markdown/test.md
--- title: Sample Markdown Document author: John Doe date: 2025-01-15 tags: - crawler - extractor - markdown --- # Introduction This is a sample Markdown document for testing the MarkdownExtractor. ## Features The extractor should handle: - YAML front matter extraction - Heading structure - **Bold text** and *italic text* - Lists and other formatting ### Code Examples
Registered: Sat Dec 20 11:21:39 UTC 2025 - Last Modified: Sun Nov 23 03:46:53 UTC 2025 - 767 bytes - Viewed (0) -
fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsPublisherExtractor.java
* Creates a new MsPublisherExtractor instance. */ public MsPublisherExtractor() { super(); } /** * Extracts text from the Publisher input stream. * @param in The input stream. * @param params The parameters. * @return The extracted data. */ @Override public ExtractData getText(final InputStream in, final Map<String, String> params) { if (in == null) {Registered: Sat Dec 20 11:21:39 UTC 2025 - Last Modified: Sun Nov 23 12:19:14 UTC 2025 - 2K bytes - Viewed (0) -
fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/JsonExtractor.java
protected int maxDepth = 10; /** Separator for key-value pairs in extracted text. */ protected String fieldSeparator = ": "; /** Separator between different fields in extracted text. */ protected String lineSeparator = "\n"; /** Whether to extract top-level fields as metadata. */ protected boolean extractMetadata = true; /** Prefix for flattened nested keys. */Registered: Sat Dec 20 11:21:39 UTC 2025 - Last Modified: Sun Nov 23 03:46:53 UTC 2025 - 9.7K bytes - Viewed (0) -
fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractXmlExtractor.java
throw new ExtractException(e); } } return encoding; } /** * Extracts text content from the given content by removing tags and processing attributes. * @param content The content to extract from. * @return The extracted text. */ protected String extractString(final String content) { String input = content.replaceAll("[\\r\\n]", " ");
Registered: Sat Dec 20 11:21:39 UTC 2025 - Last Modified: Sun Nov 23 12:19:14 UTC 2025 - 8.6K bytes - Viewed (0) -
fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsPowerPointExtractor.java
* @return The extracted data. */ @Override public ExtractData getText(final InputStream in, final Map<String, String> params) { validateInputStream(in); try (final HSLFSlideShow slideShow = new HSLFSlideShow(in); final SlideShowExtractor<HSLFShape, HSLFTextParagraph> extractor = new SlideShowExtractor<>(slideShow)) { return new ExtractData(extractor.getText());Registered: Sat Dec 20 11:21:39 UTC 2025 - Last Modified: Mon Nov 24 03:59:47 UTC 2025 - 2K bytes - Viewed (0) -
fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsExcelExtractor.java
final org.apache.poi.hssf.extractor.ExcelExtractor excelExtractor = new org.apache.poi.hssf.extractor.ExcelExtractor(workbook)) { return new ExtractData(excelExtractor.getText()); } catch (final IOException e) { throw new ExtractException("Failed to extract text from Excel document.", e); } }Registered: Sat Dec 20 11:21:39 UTC 2025 - Last Modified: Mon Nov 24 03:59:47 UTC 2025 - 1.9K bytes - Viewed (0) -
fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsVisioExtractor.java
/** * Creates a new MsVisioExtractor instance. */ public MsVisioExtractor() { super(); } /** * Extracts text from the Visio input stream. * @param in The input stream. * @param params The parameters. * @return The extracted data. */ @Override public ExtractData getText(final InputStream in, final Map<String, String> params) { if (in == null) {Registered: Sat Dec 20 11:21:39 UTC 2025 - Last Modified: Sun Nov 23 12:19:14 UTC 2025 - 1.9K bytes - Viewed (0)