extractor - Code Search

src/main/resources/crawler/extractor+tikaExtractor.xml

	"http://dbflute.org/meta/lastadi10.dtd">
<components namespace="fessCrawler">
	<include path="crawler/container.xml" />
	<component name="tikaExtractor"
		class="org.codelibs.fess.crawler.extractor.impl.TikaExtractor">
		<property name="maxCompressionRatio">2</property>
		<property name="maxUncompressionSize">10000000</property>
	</component>

Created: Tue Mar 31 13:07:34 GMT 2026

- Last Modified: Sun Aug 25 12:46:12 GMT 2019

- 461 bytes

- Click Count (0)

github.com/codelibs/fess

src/main/java/org/codelibs/fess/crawler/transformer/FessFileTransformer.java

            throw new FessSystemException("Could not find extractorFactory.");
        }
        final Extractor extractor = extractorFactory.getExtractor(responseData.getMimeType());
        if (logger.isDebugEnabled()) {
            logger.debug("url={}, extractor={}", responseData.getUrl(), extractor);
        }
        return extractor;
    }

Created: Tue Mar 31 13:07:34 GMT 2026

- Last Modified: Fri Nov 28 16:29:12 GMT 2025

- 3.5K bytes

- Click Count (0)

github.com/codelibs/fess

src/main/java/org/codelibs/fess/crawler/transformer/FessStandardTransformer.java

    }

    /**
     * Gets the appropriate extractor for the given response data.
     * Selects an extractor based on the MIME type or falls back to the Tika extractor.
     *
     * @param responseData the response data containing the document to extract
     * @return the extractor instance for processing the document
     * @throws FessSystemException if no suitable extractor can be found
     */
    @Override

Created: Tue Mar 31 13:07:34 GMT 2026

- Last Modified: Fri Nov 28 16:29:12 GMT 2025

- 3.8K bytes

- Click Count (0)

github.com/codelibs/fess-crawler

CLAUDE.md

- **ResponseProcessor**: `DefaultResponseProcessor`, `SitemapsResponseProcessor`, `NullResponseProcessor`
- **Transformer**: `HtmlTransformer`, `XmlTransformer`, `FileTransformer`, etc.
- **Extractor**: Weight-based selection (tries in descending weight order)

### Key Extractors

`TikaExtractor`, `PdfExtractor`, `MsWordExtractor`, `MsExcelExtractor`, `MsPowerPointExtractor`, `ZipExtractor`, `HtmlExtractor`, `MarkdownExtractor`, `EmlExtractor`

Created: Sun Apr 12 03:50:13 GMT 2026

- Last Modified: Thu Mar 12 03:39:20 GMT 2026

- 8.1K bytes

- Click Count (0)

github.com/codelibs/fess

src/test/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformerTest.java

import org.codelibs.fess.Constants;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.extractor.Extractor;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.unit.UnitFessTestCase;
import org.codelibs.fess.util.ComponentUtil;
import org.junit.jupiter.api.Test;

Created: Tue Mar 31 13:07:34 GMT 2026

- Last Modified: Thu Jan 15 12:54:47 GMT 2026

- 8.1K bytes

- Click Count (0)

github.com/codelibs/fess-crawler

fess-crawler-lasta/src/main/resources/crawler.xml

	<include path="crawler/container.xml"/>
	<include path="crawler/client.xml"/>
	<include path="crawler/rule.xml"/>
	<include path="crawler/filter.xml"/>
	<include path="crawler/interval.xml"/>
	<include path="crawler/extractor.xml"/>
	<include path="crawler/mimetype.xml"/>
	<include path="crawler/encoding.xml"/>
	<include path="crawler/urlconverter.xml"/>
	<include path="crawler/log.xml"/>
	<include path="crawler/sitemaps.xml"/>

Created: Sun Apr 12 03:50:13 GMT 2026

- Last Modified: Tue Nov 28 13:40:25 GMT 2017

- 1.7K bytes

- Click Count (0)

github.com/codelibs/fess-crawler

fess-crawler-opensearch/src/main/resources/crawler_opensearch.xml

    <include path="crawler/client.xml"/>
    <include path="crawler/rule.xml"/>
    <include path="crawler/filter.xml"/>
    <include path="crawler/interval.xml"/>
    <include path="crawler/extractor.xml"/>
    <include path="crawler/mimetype.xml"/>
    <include path="crawler/encoding.xml"/>
    <include path="crawler/urlconverter.xml"/>
    <include path="crawler/log.xml"/>
    <include path="crawler/sitemaps.xml"/>

Created: Sun Apr 12 03:50:13 GMT 2026

- Last Modified: Thu Nov 07 04:44:10 GMT 2024

- 2.2K bytes

- Click Count (0)

github.com/minio/minio

internal/s3select/jstream/README.md

# jstream

[![GoDoc](https://godoc.org/github.com/bcicen/jstream?status.svg)](https://godoc.org/github.com/bcicen/jstream)


`jstream` is a streaming JSON parser and value extraction library for Go.

Unlike most JSON parsers, `jstream` is document position- and depth-aware -- this enables the extraction of values at a specified depth, eliminating the overhead of allocating encompassing arrays or objects; e.g.:

Using the below example document:

Created: Sun Apr 05 19:28:12 GMT 2026

- Last Modified: Mon Sep 23 19:35:41 GMT 2024

- 3.2K bytes

- Click Count (0)

github.com/codelibs/fess-suggest

src/main/java/org/codelibs/fess/suggest/index/contents/ContentsParser.java

            ReadingConverter readingConverter, Normalizer normalizer);

    /**
     * Parses a document and extracts suggest items based on the provided fields and converters.
     *
     * @param document The document to parse, represented as a map of field names to values.
     * @param fields The fields to extract from the document.
     * @param tagFieldNames The names of the fields that contain tags.

Created: Fri Apr 17 09:08:13 GMT 2026

- Last Modified: Sat Mar 15 06:51:20 GMT 2025

- 4.1K bytes

- Click Count (0)

github.com/minio/minio

internal/crypto/sse-c.go

	return metadata
}

// ParseMetadata extracts all SSE-C related values from the object metadata
// and checks whether they are well-formed. It returns the sealed object key
// on success.
func (ssec) ParseMetadata(metadata map[string]string) (sealedKey SealedKey, err error) {
	// Extract all required values from object metadata
	b64IV, ok := metadata[MetaIV]
	if !ok {

Created: Sun Apr 05 19:28:12 GMT 2026

- Last Modified: Sun Sep 28 20:59:21 GMT 2025

- 5.2K bytes

- Click Count (0)

Search Options