Extraction - Code Search

fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/CsvExtractor.java

/**
 * Extracts text content and metadata from CSV files.
 * This extractor provides better structured data extraction compared to Tika's generic text extraction.
 *
 * <p>Features:
 * <ul>
 *   <li>Automatic delimiter detection (comma, tab, semicolon, pipe)</li>
 *   <li>Header row detection and extraction</li>
 *   <li>Column name to data value association</li>
 *   <li>Quoted field handling</li>

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Thu Dec 11 08:38:29 UTC 2025

- 12.8K bytes

- Viewed (0)

github.com/codelibs/fess

src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java

        }
        return URI.create(currentUrl);
    }

    /**
     * Gets child URL extraction rules from configuration.
     *
     * @param responseData the response data from crawling
     * @param resultData the result data
     * @return stream of tag-attribute pairs for URL extraction
     */
    @Override

Registered: Sat Dec 20 09:19:18 UTC 2025

- Last Modified: Fri Dec 12 13:58:40 UTC 2025

- 54.6K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TextExtractor.java

        } catch (final Exception e) {
            throw new ExtractException("Failed to extract text content using encoding: " + getEncoding(), e);
        }
    }

    /**
     * Reports the encoding currently held by this extractor.
     *
     * @return the value of the {@code encoding} field
     */
    public String getEncoding() {
        return this.encoding;
    }

    /**
     * Sets the encoding.
     * @param encoding The encoding to set.
     */

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Thu Dec 11 08:38:29 UTC 2025

- 2K bytes

- Viewed (0)

github.com/codelibs/fess

src/main/java/org/codelibs/fess/crawler/transformer/FessFileTransformer.java

import jakarta.annotation.PostConstruct;

/**
 * File transformer implementation for the Fess search engine.
 * This transformer handles file-based document transformation and content extraction
 * using the Fess file transformation process with support for various file types.
 *
 * <p>It extends AbstractFessFileTransformer to provide specialized file processing

Registered: Sat Dec 20 09:19:18 UTC 2025

- Last Modified: Fri Nov 28 16:29:12 UTC 2025

- 3.5K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

CLAUDE.md

**Fess Crawler** is a Java-based web crawling framework for enterprise content extraction.

### Essential Info

- **Language**: Java 21+
- **Build**: Maven 3.x
- **License**: Apache 2.0
- **DI**: LastaFlute DI
- **Repo**: https://github.com/codelibs/fess-crawler

### Tech Stack

- **HTTP**: Apache HttpComponents 4.5+
- **Extraction**: Apache Tika 3.0+, POI 5.3+, PDFBox 3.0+
- **Testing**: JUnit 4, UTFlute, Mockito 5.7.0

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Fri Nov 28 17:31:34 UTC 2025

- 10.7K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/transformer/impl/HtmlTransformer.java

        this.propertyMap = propertyMap;
    }

    /**
     * Provides access to the child URL extraction rules.
     * The map held by this instance is returned directly, not a copy.
     *
     * @return the child URL rule map
     */
    public Map<String, String> getChildUrlRuleMap() {
        return this.childUrlRuleMap;
    }

    /**
     * Sets the map of child URL extraction rules.
     *
     * @param childUrlRuleMap the child URL rule map to set
     */

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Sat Nov 29 07:42:33 UTC 2025

- 30.5K bytes

- Viewed (0)

github.com/codelibs/fess

src/main/java/org/codelibs/fess/helper/ThemeHelper.java

import org.codelibs.fess.helper.PluginHelper.ArtifactType;
import org.codelibs.fess.util.ResourceUtil;

/**
 * Helper class for managing theme installation and uninstallation.
 * Handles the extraction and deployment of theme files from JAR artifacts.
 */
public class ThemeHelper {
    private static final Logger logger = LogManager.getLogger(ThemeHelper.class);

    /**
     * Default constructor for ThemeHelper.

Registered: Sat Dec 20 09:19:18 UTC 2025

- Last Modified: Fri Nov 28 16:29:12 UTC 2025

- 7.1K bytes

- Viewed (0)

github.com/codelibs/fess

src/main/java/org/codelibs/fess/helper/DocumentHelper.java

/**
 * Helper class for document processing and manipulation in the Fess search system.
 * This class provides utilities for processing document content, titles, and digests,
 * handling text normalization, content extraction, and similar document hash encoding/decoding.
 * It also manages document processing requests and integrates with the crawler system.
 *
 */
public class DocumentHelper {

Registered: Sat Dec 20 09:19:18 UTC 2025

- Last Modified: Fri Nov 28 16:29:12 UTC 2025

- 17.4K bytes

- Viewed (0)

github.com/codelibs/fess

src/main/java/org/codelibs/fess/crawler/transformer/FessStandardTransformer.java

import org.codelibs.fess.util.ComponentUtil;

import jakarta.annotation.PostConstruct;

/**
 * Standard transformer implementation for the Fess search engine.
 * This transformer handles document transformation and content extraction using
 * the standard Fess file transformation process with support for various content types.
 *
 * <p>It extends AbstractFessFileTransformer to provide file-specific transformation

Registered: Sat Dec 20 09:19:18 UTC 2025

- Last Modified: Fri Nov 28 16:29:12 UTC 2025

- 3.8K bytes

- Viewed (0)

github.com/codelibs/fess

src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java

        }
        return false;
    }

    /**
     * Create the parameters for extraction.
     * @param responseData The response data.
     * @param crawlingConfig The crawling configuration.
     * @return The parameters for extraction.
     */
    protected Map<String, String> createExtractParams(final ResponseData responseData, final CrawlingConfig crawlingConfig) {

Registered: Sat Dec 20 09:19:18 UTC 2025

- Last Modified: Fri Nov 28 16:29:12 UTC 2025

- 25.7K bytes

- Viewed (0)

Search Options