Sitemap - Code Search

fess-crawler/src/test/java/org/codelibs/fess/crawler/entity/RobotsTxtTest.java

        robotsTxt.addSitemap("https://example.com/sitemap.xml");
        robotsTxt.addSitemap("https://example.com/sitemap2.xml");

        String[] sitemaps = robotsTxt.getSitemaps();
        assertEquals(2, sitemaps.length);
        assertEquals("https://example.com/sitemap.xml", sitemaps[0]);
        assertEquals("https://example.com/sitemap2.xml", sitemaps[1]);
    }

    public void test_addSitemapNoDuplicates() {

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Thu Nov 13 13:29:22 UTC 2025

- 14.4K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/helper/SitemapsHelper.java

/**
 * Helper class for parsing and validating sitemaps.
 * It supports XML sitemaps, XML sitemap indexes, and text sitemaps,
 * and can handle GZIP compressed sitemaps.
 * The class provides methods to check if an input stream is a valid sitemap,
 * and to parse an input stream into a {@link SitemapSet} object.
 * It uses SAX parser for XML sitemaps and XML sitemap indexes,
 * and handles potential exceptions during parsing.

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Fri Nov 14 13:19:40 UTC 2025

- 34.9K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/test/java/org/codelibs/fess/crawler/CrawlerContextTest.java

    }

    /**
     * Test sitemaps add and remove operations
     */
    public void test_sitemaps() {
        // Initial state
        assertNull(crawlerContext.removeSitemaps());

        // Add sitemaps
        String[] sitemaps = new String[] { "http://example.com/sitemap.xml", "http://test.com/sitemap.xml" };
        crawlerContext.addSitemaps(sitemaps);

        // Remove and verify

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Sat Sep 06 04:15:37 UTC 2025

- 25.6K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/rule/impl/SitemapsRule.java

 * represents a valid sitemap. It uses a SitemapsHelper to validate the response body as an InputStream.
 * The rule checks if the URL matches the defined regex pattern and then validates the content as a sitemap.
 * If any exception occurs during the sitemap validation, it logs the error and returns false.
 *
 */
public class SitemapsRule extends RegexRule {
    /**

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Sun Jul 06 02:13:03 UTC 2025

- 2.6K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/entity/RobotsTxt.java

    }

    /**
     * Registers a sitemap URL, ignoring it when the same URL is already present
     * in the sitemap list.
     *
     * @param url the sitemap URL to register
     */
    public void addSitemap(final String url) {
        // Already registered: leave the list untouched so each URL appears only once.
        if (sitemapList.contains(url)) {
            return;
        }
        sitemapList.add(url);
    }

    /**
     * Returns an array of sitemap URLs.
     *
     * @return an array of sitemap URLs
     */

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Mon Nov 24 03:59:47 UTC 2025

- 18.5K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/test/resources/org/codelibs/fess/crawler/helper/robots_wildcard.txt

Allow: /page

# Test multiple wildcards
User-agent: MultiWildcardBot
Disallow: /*.cgi*
Disallow: /*?*id=*

# Test literal $ in middle of pattern
User-agent: DollarBot
Disallow: /price$info

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Thu Nov 13 14:03:41 UTC 2025

- 910 bytes

- Viewed (0)

github.com/codelibs/fess

src/main/resources/crawler/rule.xml

			<component class="org.codelibs.fess.crawler.processor.impl.SitemapsResponseProcessor">
			</component>
		</property>
		<postConstruct name="addRule">
			<arg>"url"</arg>
			<arg>"http[s]?:.*sitemap[^/]*\.xml.*|http[s]?:.*sitemap[^/]*\.gz.*|http[s]?:.*sitemap[^/]*\.txt.*"</arg>
		</postConstruct>
	</component>

	<component name="webHtmlRule" class="org.codelibs.fess.crawler.rule.impl.RegexRule" >
		<property name="ruleId">"webHtmlRule"</property>

Registered: Sat Dec 20 09:19:18 UTC 2025

- Last Modified: Thu Jun 04 08:42:49 UTC 2020

- 4.6K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/test/resources/org/codelibs/fess/crawler/helper/robots.txt

User-agent: Crawler
Disallow: /aaa

User-agent: Crawler/1.0
Disallow: /bbb

User-agent: Crawler/2.0
Disallow: /ccc

User-agent: Hoge Crawler
Disallow: /ddd

sitemap: http://www.example.com/sitmap.xml

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Sun Oct 11 02:16:55 UTC 2015

- 566 bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/test/java/org/codelibs/fess/crawler/rule/impl/RuleManagerImplTest.java

    }

    public void test_getRule_sitemaps5() {
        final ResponseData responseData = new ResponseData();
        responseData.setUrl("http://www.example.com/sitemap/");
        File file = ResourceUtil.getResourceAsFile("sitemaps/sitemap1.xml");
        responseData.setResponseBody(file, false);
        final Rule rule = ruleManager.getRule(responseData);
        assertNotNull(rule);

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Sat Mar 15 06:52:00 UTC 2025

- 6.2K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/entity/SitemapNews.java

 * </p>
 *
 * @see <a href="https://developers.google.com/search/docs/crawling-indexing/sitemaps/news-sitemap">Google News Sitemaps</a>
 */
public class SitemapNews implements Serializable {

    private static final long serialVersionUID = 1L;

    /**
     * The name of the news publication.
     * It must exactly match the name as it appears on your articles.
     */

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Thu Nov 13 13:34:36 UTC 2025

- 4.8K bytes

- Viewed (0)

Search Options