Extract - Code Search

fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java

import jp.gr.java_conf.dangan.util.lha.LhaFile;
import jp.gr.java_conf.dangan.util.lha.LhaHeader;

/**
 * Extractor implementation for LHA (LZH) archive files.
 * This extractor can extract text content from files within LHA archives
 * by using appropriate extractors for each contained file type.
 *
 * @author shinsuke
 */
public class LhaExtractor extends AbstractExtractor {

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 5.8K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/ExtractorBuilder.java

    /**
     * Sets the MIME type of the content to extract.
     *
     * <p>The given value is stored on this builder as-is; this method performs
     * no validation or normalization of its own.
     *
     * @param mimeType the MIME type to set
     * @return this builder instance for method chaining
     */
    public ExtractorBuilder mimeType(final String mimeType) {
        this.mimeType = mimeType;
        return this;
    }

    /**
     * Sets the filename of the content to extract.
     *
     * @param filename the filename to set

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 10.1K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java

            }
        } catch (final MaxLengthExceededException e) {
            throw e;
        } catch (final Exception e) {
            if (buf.length() == 0) {
                throw new ExtractException("Could not extract a content.", e);
            }
        }

        return new ExtractData(buf.toString().trim());
    }

    /**
     * Sets the maximum content size.

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 4.5K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java

            }
        } catch (final MaxLengthExceededException e) {
            throw e;
        } catch (final Exception e) {
            if (buf.length() == 0) {
                throw new ExtractException("Could not extract a content.", e);
            }
        } finally {
            CloseableUtil.closeQuietly(ais);
        }

        return buf.toString().trim();
    }

    /**

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 5K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/transformer/impl/HtmlTransformer.java

        }
        return null;
    }

    /**
     * Extracts URLs from HTML tag attributes using XPath.
     *
     * @param url the base URL for resolving relative URLs
     * @param document the document to extract URLs from
     * @param xpath the XPath expression to select elements
     * @param attr the attribute name to extract URLs from
     * @param encoding the character encoding to use

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 28.5K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractXmlExtractor.java

                throw new ExtractException(e);
            }
        }

        return encoding;
    }

    /**
     * Extracts text content from the given content by removing tags and processing attributes.
     * @param content The content to extract from.
     * @return The extracted text.
     */
    protected String extractString(final String content) {

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 8.5K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/PdfExtractor.java

import org.codelibs.fess.crawler.helper.MimeTypeHelper;

/**
 * PdfExtractor extracts text content from PDF files using Apache PDFBox.
 * It supports password-protected PDFs and can extract embedded documents and annotations.
 *
 * <p>The extractor runs text extraction in a separate thread with a configurable timeout
 * to prevent hanging on problematic PDF files. It also extracts metadata from the PDF
 * document and includes it in the extraction result.
 *

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 12.7K bytes

- Viewed (0)

github.com/codelibs/fess-suggest

src/main/java/org/codelibs/fess/suggest/converter/KatakanaConverter.java

 * use a tokenizer to process the input.
 *
 * <p>
 * The class provides methods to initialize the converter, convert strings, and
 * check if a tokenizer is enabled. It also includes methods to create a token
 * stream and extract reading information from the stream's attributes, although
 * the tokenizer-related functionality is currently commented out.
 * </p>
 */
public class KatakanaConverter implements ReadingConverter {

Registered: Fri Sep 19 09:08:11 UTC 2025

- Last Modified: Fri Jul 04 14:00:23 UTC 2025

- 6.1K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java

        } catch (final MessagingException e) {
            throw new ExtractException(e);
        }
    }

    /**
     * Puts a value into the extract data with appropriate type conversion.
     *
     * @param data the extract data to store the value in
     * @param key the key for the value
     * @param value the value to store
     */

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 12.6K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/HtmlExtractor.java

            });
            return extractData;
        } finally {
            xpathAPI.remove();
        }
    }

    /**
     * Extracts strings from a document using the specified XPath expression.
     *
     * @param document the DOM document to extract strings from
     * @param path the XPath expression to evaluate
     * @return an array of strings extracted from the document
     */

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 9.3K bytes

- Viewed (0)

Search Options