sitemap - Code Search

fess-crawler/src/test/java/org/codelibs/fess/crawler/processor/impl/SitemapsResponseProcessorTest.java

        // Test handling of duplicate URLs in sitemap
        ResponseData responseData = new ResponseData();
        byte[] content = "<sitemap></sitemap>".getBytes();
        responseData.setResponseBody(content);

        SitemapUrl sitemap1 = new SitemapUrl();
        sitemap1.setLoc("https://example.com/duplicate");

        SitemapUrl sitemap2 = new SitemapUrl();
        sitemap2.setLoc("https://example.com/duplicate");

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Thu Nov 13 13:29:22 UTC 2025

- 12K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/test/java/org/codelibs/fess/crawler/helper/SitemapsHelperTest.java

                        + "  </sitemap>\n" + "  <sitemap>\n" + "    <lastmod>2025-01-02</lastmod>\n" + "  </sitemap>\n" + "  <sitemap>\n"
                        + "    <loc>http://www.example.com/sitemap2.xml</loc>\n" + "  </sitemap>\n" + "</sitemapindex>";
        final InputStream in = new ByteArrayInputStream(xml.getBytes());
        final SitemapSet sitemapSet = sitemapsHelper.parse(in);
        final Sitemap[] sitemaps = sitemapSet.getSitemaps();

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Mon Nov 24 03:59:47 UTC 2025

- 36.7K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/test/resources/org/codelibs/fess/crawler/helper/robots_malformed.txt

User-agent: Bot1
User-agent: Bot2
User-agent: Bot3
Disallow: /shared/

# Case 11: Sitemap with various formats
Sitemap: http://example.com/sitemap.xml
sitemap: http://example.com/sitemap2.xml
SITEMAP: http://example.com/sitemap3.xml
Sitemap:    # empty sitemap (should be ignored)
Sitemap: not-a-valid-url

# Case 12: Malformed lines that should be completely ignored
This line is completely invalid
:NoKey

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Fri Nov 14 12:52:01 UTC 2025

- 2.6K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/test/java/org/codelibs/fess/crawler/entity/RobotsTxtTest.java

        robotsTxt.addSitemap("https://example.com/sitemap.xml");
        robotsTxt.addSitemap("https://example.com/sitemap2.xml");

        String[] sitemaps = robotsTxt.getSitemaps();
        assertEquals(2, sitemaps.length);
        assertEquals("https://example.com/sitemap.xml", sitemaps[0]);
        assertEquals("https://example.com/sitemap2.xml", sitemaps[1]);
    }

    public void test_addSitemapNoDuplicates() {

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Thu Nov 13 13:29:22 UTC 2025

- 14.4K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/entity/SitemapUrl.java

/**
 * Represents a URL entry within a sitemap.
 *
 * <p>
 * This class encapsulates the properties of a URL as defined in the sitemap XML format,
 * including its location, last modification date, change frequency, and priority.
 * It also supports sitemap extensions such as images, videos, news, and alternate links.
 * It implements the {@link Sitemap} interface.
 * </p>
 *
 * <p>

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Thu Nov 13 13:34:36 UTC 2025

- 9.1K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/helper/SitemapsHelper.java

/**
 * Helper class for parsing and validating sitemaps.
 * It supports XML sitemaps, XML sitemap indexes, and text sitemaps,
 * and can handle GZIP compressed sitemaps.
 * The class provides methods to check if an input stream is a valid sitemap,
 * and to parse an input stream into a {@link SitemapSet} object.
 * It uses SAX parser for XML sitemaps and XML sitemap indexes,
 * and handles potential exceptions during parsing.

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Fri Nov 14 13:19:40 UTC 2025

- 34.9K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/entity/RobotsTxt.java

    }

    /**
     * Registers a sitemap URL, skipping it when the same value is
     * already present in the sitemap list.
     *
     * @param url The URL of the sitemap to be added
     */
    public void addSitemap(final String url) {
        // Guard clause: an already-registered URL must not be added twice.
        if (sitemapList.contains(url)) {
            return;
        }
        sitemapList.add(url);
    }

    /**
     * Returns an array of sitemap URLs.
     *
     * @return an array of sitemap URLs
     */

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Mon Nov 24 03:59:47 UTC 2025

- 18.5K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/test/resources/org/codelibs/fess/crawler/helper/robots_wildcard.txt

Allow: /page

# Test multiple wildcards
User-agent: MultiWildcardBot
Disallow: /*.cgi*
Disallow: /*?*id=*

# Test literal $ in middle of pattern
User-agent: DollarBot
Disallow: /price$info

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Thu Nov 13 14:03:41 UTC 2025

- 910 bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/entity/SitemapNews.java

 * </p>
 *
 * @see <a href="https://developers.google.com/search/docs/crawling-indexing/sitemaps/news-sitemap">Google News Sitemaps</a>
 */
public class SitemapNews implements Serializable {

    private static final long serialVersionUID = 1L;

    /**
     * The name of the news publication.
     * It must exactly match the name as it appears on your articles.
     */

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Thu Nov 13 13:34:36 UTC 2025

- 4.8K bytes

- Viewed (0)

github.com/codelibs/fess-crawler

fess-crawler/src/main/java/org/codelibs/fess/crawler/entity/SitemapImage.java

 * </p>
 *
 * @see <a href="https://developers.google.com/search/docs/crawling-indexing/sitemaps/image-sitemaps">Google Image Sitemaps</a>
 */
public class SitemapImage implements Serializable {

    private static final long serialVersionUID = 1L;

    /**
     * The URL of the image.
     * In some cases, the image URL may not be on the same domain as your main site.
     */
    private String loc;

Registered: Sat Dec 20 11:21:39 UTC 2025

- Last Modified: Thu Nov 13 13:34:36 UTC 2025

- 3.8K bytes

- Viewed (0)

Search Options