Burles - Code Search

fess-crawler/src/main/java/org/codelibs/fess/crawler/helper/UrlConvertHelper.java

/**
 * Helper class for converting URLs based on a set of predefined rules.
 *
 * <p>This class provides functionality to convert URLs by replacing parts of the URL
 * based on a map of target strings and their corresponding replacements. It allows
 * adding new conversion rules, setting the entire conversion map, and converting
 * URLs using these rules.</p>
 *

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 3.1K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

README.md

#### Content Processing Pipeline
- **Extractors**: Content extraction from various formats
- **Transformers**: Data transformation and enrichment
- **Filters**: URL filtering with regex patterns
- **Rules**: Content processing rules and validation

## Building and Testing

### Build Commands

```bash
# Build all modules
mvn clean install

# Build without tests
mvn clean install -DskipTests
```

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Aug 31 05:32:52 UTC 2025

- 15.3K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/entity/RobotsTxt.java

        return null;
    }

    /**
     * Adds a directive to the robots.txt rules.
     * The user-agent pattern in the directive is converted to a regular expression pattern,
     * where '*' is replaced with '.*' for pattern matching, and stored case-insensitively.
     *
     * @param directive The directive to add to the robots.txt rules
     */
    public void addDirective(final Directive directive) {

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 10K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/test/java/org/codelibs/fess/crawler/rule/impl/AbstractRuleTest.java

        testRule.setRuleId("middle");
        testRule.register(1);

        List<Rule> rules = ruleManager.getRules();
        assertEquals(3, rules.size());
        assertEquals("rule1", rules.get(0).getRuleId());
        assertEquals("middle", rules.get(1).getRuleId());
        assertEquals("rule2", rules.get(2).getRuleId());
    }

    /**
     * Test register method with last index
     */

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Wed Sep 03 14:42:53 UTC 2025

- 21.9K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/transformer/impl/HtmlTransformer.java

 *   <li>Extracting child URLs from the HTML content based on configured rules.</li>
 *   <li>Handling redirect URLs specified in the response headers.</li>
 * </ol>
 * <p>
 * The class also provides methods for configuring features and properties of the
 * underlying DOM parser, as well as defining rules for extracting child URLs
 * from specific HTML tags and attributes.
 * </p>
 *
 * <p>

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 28.5K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/filter/impl/UrlFilterImpl.java

import jakarta.annotation.Resource;

/**
 * Implementation of the {@link UrlFilter} interface.
 * This class provides functionality to filter URLs based on include and exclude patterns.
 * It uses a {@link UrlFilterService} to manage the URL filtering rules.
 * The class supports caching of include and exclude patterns for scenarios where a session ID is not available.

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 9.2K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/CrawlerContext.java

    }

    /**
     * Gets the robots.txt URL set held by this object.
     *
     * <p>The stored set reference itself is returned; no copy is made here.</p>
     *
     * @return the set of robots.txt URLs
     */
    public Set<String> getRobotsTxtUrlSet() {
        return this.robotsTxtUrlSet;
    }

    /**
     * Sets the set of robots.txt URLs.
     * @param robotsTxtUrlSet The set of robots.txt URLs.
     */
    public void setRobotsTxtUrlSet(final Set<String> robotsTxtUrlSet) {

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 8.9K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/test/java/org/codelibs/fess/crawler/client/smb/SmbClientTest.java

        } catch (final ChildUrlsException e) {
            String[] urls = e.getChildUrlList().stream().map(r -> r.getUrl()).sorted().toArray(String[]::new);
            assertEquals(3, urls.length);
            assertEquals(baseUrl + "dir1/", urls[0]);
            assertEquals(baseUrl + "dir3/", urls[1]);
            assertEquals(baseUrl + "file1.txt", urls[2]);
        }
        try {
            smbClient.doGet(baseUrl + "dir1/");

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sat Sep 06 04:15:37 UTC 2025

- 13.7K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/rule/RuleManager.java

 */
package org.codelibs.fess.crawler.rule;

import org.codelibs.fess.crawler.entity.ResponseData;

/**
 * The RuleManager interface provides methods to manage rules for processing response data.
 * It allows adding, retrieving, and removing rules, as well as checking for their existence.
 */
public interface RuleManager {

    /**
     * Retrieves the rule associated with the given response data.
     *

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 2.1K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/test/java/org/codelibs/fess/crawler/rule/RuleManagerTest.java

        List<Rule> rules = ruleManager.getRules();
        assertEquals(4, rules.size());
        assertEquals("rule1", rules.get(0).getRuleId());
        assertEquals("rule2", rules.get(1).getRuleId());
        assertEquals("rule3", rules.get(2).getRuleId());
        assertEquals("rule4", rules.get(3).getRuleId());
    }

    /**
     * Test adding duplicate rules
     */

Registered: Sun Sep 21 03:50:09 UTC 2025

- Last Modified: Sat Sep 06 04:15:37 UTC 2025

- 23.8K bytes

- Viewed (0)

Search Options