Search Options

Results per page
Sort
Preferred Languages
Advanced

Results 1 - 10 of 50 for Pdf (0.03 sec)

  1. fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/PdfExtractor.java

    /**
     * PdfExtractor extracts text content from PDF files using Apache PDFBox.
     * It supports password-protected PDFs and can extract embedded documents and annotations.
     *
     * <p>The extractor runs text extraction in a separate thread with a configurable timeout
     * to prevent hanging on problematic PDF files. It also extracts metadata from the PDF
     * document and includes it in the extraction result.
     *
     * <p>Features:
    Registered: Sat Dec 20 11:21:39 UTC 2025
    - Last Modified: Sun Nov 23 12:19:14 UTC 2025
    - 12.8K bytes
    - Viewed (0)
  2. fess-crawler/src/test/java/org/codelibs/fess/crawler/entity/ExtractDataTest.java

            data.putValue(ExtractData.RESOURCE_NAME_KEY, "test.pdf");
            data.putValue(ExtractData.URL, "https://example.com/test.pdf");
            data.putValues(ExtractData.FILE_PASSWORDS, new String[] { "pass1", "pass2" });
    
            assertEquals("test.pdf", data.getValues(ExtractData.RESOURCE_NAME_KEY)[0]);
            assertEquals("https://example.com/test.pdf", data.getValues(ExtractData.URL)[0]);
    Registered: Sat Dec 20 11:21:39 UTC 2025
    - Last Modified: Mon Nov 24 03:59:47 UTC 2025
    - 9.9K bytes
    - Viewed (0)
  3. fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/FilenameExtractorEnhancedTest.java

            final Map<String, String> params = new HashMap<>();
            params.put(ExtractData.RESOURCE_NAME_KEY, "test-document.pdf");
    
            final ExtractData result = filenameExtractor.getText(in, params);
    
            assertNotNull(result);
            assertEquals("test-document.pdf", result.getContent());
        }
    
        /**
         * Test extraction with null parameters map.
         */
    Registered: Sat Dec 20 11:21:39 UTC 2025
    - Last Modified: Mon Nov 24 03:59:47 UTC 2025
    - 7K bytes
    - Viewed (0)
  4. fess-crawler/src/test/java/org/codelibs/fess/crawler/helper/RobotsTxtHelperTest.java

            }
    
            // Test WildcardBot - wildcard patterns
            // Disallow: /*.pdf$ - should block .pdf files but not .pdf with query params
            assertFalse(robotsTxt.allows("/document.pdf", "WildcardBot"));
            assertFalse(robotsTxt.allows("/files/report.pdf", "WildcardBot"));
            assertTrue(robotsTxt.allows("/document.pdf?download=true", "WildcardBot")); // $ means exact end
    
    Registered: Sat Dec 20 11:21:39 UTC 2025
    - Last Modified: Mon Nov 24 03:59:47 UTC 2025
    - 20.6K bytes
    - Viewed (0)
  5. src/main/resources/fess_thumbnail.xml

    		<property name="commandList">
    			["${path}/generate-thumbnail",
    			"pdf",
    			"${url}",
    			"${outputFile}"]
    		</property>
    		<property name="generatorList">
    			["${path}/generate-thumbnail"]
    		</property>
    		<postConstruct name="addCondition">
    			<arg>"mimetype"</arg>
    			<arg>"application/pdf"
    			</arg>
    		</postConstruct>
    		<postConstruct name="register"></postConstruct>
    	</component>
    Registered: Sat Dec 20 09:19:18 UTC 2025
    - Last Modified: Thu Dec 04 08:02:36 UTC 2025
    - 6K bytes
    - Viewed (0)
  6. fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/JodExtractor.java

            // Presentation Formats
            extensionMap.put("odp", "pdf");
            extensionMap.put("otp", "pdf");
            extensionMap.put("sxi", "pdf");
            extensionMap.put("ppt", "pdf");
            extensionMap.put("pptx", "pdf");
            // Drawing Formats
            extensionMap.put("odg", "svg");
            extensionMap.put("otg", "svg");
    
            extractorMap.put("pdf", new PdfExtractor());
    Registered: Sat Dec 20 11:21:39 UTC 2025
    - Last Modified: Sun Nov 23 12:19:14 UTC 2025
    - 10.4K bytes
    - Viewed (0)
  7. src/main/assemblies/files/generate-thumbnail

      if [[ -z "${im_cmd}" ]] ; then
        echo "ImageMagick (convert or magick) does not exist."
        exit 1
      fi
      check_command pdftoppm
      check_command unoconv
      tmp_pdf_file=/tmp/thumbnail.$$.pdf
      unoconv -e PageRange=1-1 -o ${tmp_pdf_file} -f pdf "${target_file}"
      if [[ ! -f ${tmp_pdf_file} ]] ; then
        echo "unoconv does not work."
        exit 1
      fi
      tmp_png_prefix=/tmp/thumbnail.png.$$
    Registered: Sat Dec 20 09:19:18 UTC 2025
    - Last Modified: Thu Dec 04 08:02:36 UTC 2025
    - 3.9K bytes
    - Viewed (0)
  8. fess-crawler/src/test/resources/org/codelibs/fess/crawler/helper/robots_wildcard.txt

    # Test robots.txt for wildcard (*) and end-of-path ($) support
    # Based on RFC 9309 specification
    
    # Test wildcard patterns
    User-agent: WildcardBot
    Disallow: /*.pdf$
    Disallow: /admin/*.php
    Disallow: /*/private/
    Allow: /public/*.html
    
    # Test end-of-path ($) patterns
    User-agent: EndPathBot
    Disallow: /fish$
    Disallow: /temp$
    Allow: /fishing
    
    # Test complex patterns
    User-agent: ComplexBot
    Disallow: /
    Allow: /$
    Registered: Sat Dec 20 11:21:39 UTC 2025
    - Last Modified: Thu Nov 13 14:03:41 UTC 2025
    - 910 bytes
    - Viewed (0)
  9. fess-crawler/src/test/java/org/codelibs/fess/net/protocol/gcs/HandlerTest.java

            URL url = new URL("gcs://mybucket/dir1/dir2/file.pdf");
            Handler handler = new Handler();
            URLConnection conn = handler.openConnection(url);
    
            Handler.GcsURLConnection gcsConn = (Handler.GcsURLConnection) conn;
    
            assertEquals("mybucket", getField(gcsConn, "bucketName"));
            assertEquals("dir1/dir2/file.pdf", getField(gcsConn, "objectName"));
        }
    
        /**
    Registered: Sat Dec 20 11:21:39 UTC 2025
    - Last Modified: Thu Dec 11 08:38:29 UTC 2025
    - 14.1K bytes
    - Viewed (0)
  10. fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TikaExtractor.java

     *   <li>Maximum term sizes for alphanumeric and symbolic terms</li>
     *   <li>Custom Tika configuration</li>
     *   <li>Tesseract OCR configuration for image-based documents</li>
     *   <li>PDF Parser configuration for PDF documents</li>
     * </ul>
     *
     * <p>
     * The {@link TikaDetectParser} inner class extends {@link CompositeParser} to provide auto-detection of the MIME type
    Registered: Sat Dec 20 11:21:39 UTC 2025
    - Last Modified: Sun Nov 23 12:19:14 UTC 2025
    - 30.8K bytes
    - Viewed (0)
Back to top