- Sort Score
- Num 10 results
- Language All
Results 1 - 10 of 50 for extractors (0.16 seconds)
The search processing time has exceeded the limit. The displayed results may be partial.
-
fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/ExtractorFactory.java
*/ public Extractor[] getExtractors(final String key) { final Extractor[] extractors = extractorMap.get(key); if (extractors == null || extractors.length == 0) { return new Extractor[0]; } return extractors; } /** * Sets the extractor map with the provided map. * * @param extractorMap a map of keys to arrays of {@link Extractor} objects */
Created: Sat Dec 20 11:21:39 GMT 2025 - Last Modified: Sun Nov 23 12:19:14 GMT 2025 - 7.4K bytes - Click Count (0) -
fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java
import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.MimeTypeHelper; import org.codelibs.fess.crawler.util.IgnoreCloseInputStream; import jp.gr.java_conf.dangan.util.lha.LhaFile; import jp.gr.java_conf.dangan.util.lha.LhaHeader; /** * Extractor implementation for LHA (LZH) archive files.
Created: Sat Dec 20 11:21:39 GMT 2025 - Last Modified: Sun Nov 23 12:19:14 GMT 2025 - 5.9K bytes - Click Count (0) -
fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractXmlExtractor.java
import org.codelibs.fess.crawler.entity.ExtractData; import org.codelibs.fess.crawler.exception.CrawlerSystemException; import org.codelibs.fess.crawler.exception.ExtractException; /** * Abstract base class for XML extractors. * Provides common functionality for extracting text content from XML-like documents. * It handles encoding detection, HTML entity unescaping, and tag-based content extraction. * */
Created: Sat Dec 20 11:21:39 GMT 2025 - Last Modified: Sun Nov 23 12:19:14 GMT 2025 - 8.6K bytes - Click Count (0) -
fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/PdfExtractor.java
import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.MimeTypeHelper; /** * PdfExtractor extracts text content from PDF files using Apache PDFBox. * It supports password-protected PDFs and can extract embedded documents and annotations. *
Created: Sat Dec 20 11:21:39 GMT 2025 - Last Modified: Sun Nov 23 12:19:14 GMT 2025 - 12.8K bytes - Click Count (0) -
fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsPowerPointExtractor.java
*/ package org.codelibs.fess.crawler.extractor.impl; import java.io.IOException; import java.io.InputStream; import java.util.Map; import org.apache.poi.hslf.usermodel.HSLFShape; import org.apache.poi.hslf.usermodel.HSLFSlideShow; import org.apache.poi.hslf.usermodel.HSLFTextParagraph; import org.apache.poi.sl.extractor.SlideShowExtractor; import org.codelibs.fess.crawler.entity.ExtractData;
Created: Sat Dec 20 11:21:39 GMT 2025 - Last Modified: Mon Nov 24 03:59:47 GMT 2025 - 2K bytes - Click Count (0) -
fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/FilenameExtractor.java
*/ package org.codelibs.fess.crawler.extractor.impl; import java.io.InputStream; import java.util.Map; import org.codelibs.core.lang.StringUtil; import org.codelibs.fess.crawler.entity.ExtractData; import org.codelibs.fess.crawler.exception.CrawlerSystemException; import org.codelibs.fess.crawler.exception.ExtractException; /** * Extracts the filename from the parameters as the content. *
Created: Sat Dec 20 11:21:39 GMT 2025 - Last Modified: Wed Nov 19 08:55:01 GMT 2025 - 2.7K bytes - Click Count (0) -
fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/MsVisioExtractor.java
* governing permissions and limitations under the License. */ package org.codelibs.fess.crawler.extractor.impl; import java.io.IOException; import java.io.InputStream; import java.util.Map; import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.codelibs.fess.crawler.entity.ExtractData; import org.codelibs.fess.crawler.exception.CrawlerSystemException;
Created: Sat Dec 20 11:21:39 GMT 2025 - Last Modified: Sun Nov 23 12:19:14 GMT 2025 - 1.9K bytes - Click Count (0) -
fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/JsonExtractor.java
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ArrayNode; import com.fasterxml.jackson.databind.node.ObjectNode; /** * Extracts text content and metadata from JSON files. * This extractor provides better structured data extraction compared to Tika's generic text extraction. * * <p>Features: * <ul> * <li>Structured text extraction with key-value pairs</li>
Created: Sat Dec 20 11:21:39 GMT 2025 - Last Modified: Sun Nov 23 03:46:53 GMT 2025 - 9.7K bytes - Click Count (0) -
fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/HtmlExtractor.java
/* * (non-Javadoc) * * @see * org.codelibs.fess.crawler.extractor.impl.AbstractXmlExtractor#getEncodingPattern() */ @Override protected Pattern getEncodingPattern() { return metaCharsetPattern; } /* * (non-Javadoc) * * @see org.codelibs.fess.crawler.extractor.impl.AbstractXmlExtractor#getTagPattern() */ @Override
Created: Sat Dec 20 11:21:39 GMT 2025 - Last Modified: Sat Oct 04 08:47:19 GMT 2025 - 9.3K bytes - Click Count (0) -
fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TextExtractor.java
*/ package org.codelibs.fess.crawler.extractor.impl; import java.io.InputStream; import java.util.Map; import org.codelibs.core.io.InputStreamUtil; import org.codelibs.fess.crawler.Constants; import org.codelibs.fess.crawler.entity.ExtractData; import org.codelibs.fess.crawler.exception.ExtractException; /** * Extracts text content from an input stream as plain text. */
Created: Sat Dec 20 11:21:39 GMT 2025 - Last Modified: Thu Dec 11 08:38:29 GMT 2025 - 2K bytes - Click Count (0)