Parse Parse
Parse Certified

Parse files with Apache Tika

yaml
# Fully qualified task type; this is the value used in a task's `type` field
# to select the Tika Parse task.
type: io.kestra.plugin.tika.Parse
yaml
# Example flow: parse a user-supplied file with the Tika Parse task and log
# the embedded content reported in its output.
id: tika_parse_file
namespace: company.team

# The document to parse is declared as a flow input of type FILE.
inputs:
  - id: file
    type: FILE

tasks:
  # Parse the input file; `from` references the `file` input declared above.
  - id: parse
    type: io.kestra.plugin.tika.Parse
    from: "{{ inputs.file }}"
    # NOTE(review): presumably makes the task also extract embedded documents,
    # which the next task reads via `result.embedded` - confirm in plugin docs.
    extractEmbedded: true
    # NOTE(review): presumably keeps the parsed result inline in the task
    # output rather than writing it to storage - confirm in plugin docs.
    store: false

  # Log the `embedded` field of the parse task's result.
  - id: log_embedded
    type: io.kestra.plugin.core.log.Log
    message: "{{ outputs.parse.result.embedded }}"

yaml
# Example flow: parse a user-supplied file with OCR enabled.
id: tika_parse_image_ocr
namespace: company.team

# The file to parse is declared as a flow input of type FILE.
inputs:
  - id: file
    type: FILE

tasks:
  # Parse the input file; `from` references the `file` input declared above.
  - id: parse
    type: io.kestra.plugin.tika.Parse
    from: "{{ inputs.file }}"
    # The documented default for ocrOptions is { "strategy": "NO_OCR" }, so an
    # OCR strategy has to be requested explicitly. Listed strategy values are
    # AUTO, NO_OCR, OCR_ONLY and OCR_AND_TEXT_EXTRACTION.
    ocrOptions:
      strategy: OCR_AND_TEXT_EXTRACTION
    # NOTE(review): presumably writes the parsed result to storage and exposes
    # it as a URI instead of inline - confirm in plugin docs.
    store: true

yaml
# Example flow: download an image over HTTP, parse it with the Tika Parse
# task, and log the metadata from the parse result.
id: parse-image-metadata-using-apache-tika
namespace: company.team

tasks:
  # Fetch the sample image; later tasks reference it via `outputs.get_image.uri`.
  - id: get_image
    type: io.kestra.plugin.core.http.Download
    uri: https://kestra.io/blogs/2023-05-31-beginner-guide-kestra.jpg

  # Parse the downloaded file.
  - id: tika
    type: io.kestra.plugin.tika.Parse
    from: "{{ outputs.get_image.uri }}"
    # NOTE(review): presumably keeps the parsed result inline in the task
    # output so the log task below can read it - confirm in plugin docs.
    store: false
    # TEXT is one of the content types listed on this page
    # (TEXT, XHTML, XHTML_NO_HEADER).
    contentType: TEXT
    # The documented default for ocrOptions is { "strategy": "NO_OCR" }, so an
    # OCR strategy is requested explicitly for the image.
    ocrOptions:
      strategy: OCR_AND_TEXT_EXTRACTION

  # Log the `metadata` field of the parse task's result.
  - id: log_metadata
    type: io.kestra.plugin.core.log.Log
    message: "{{ outputs.tika.result.metadata }}"

yaml
# Example flow: download a PDF over HTTP, parse it with the Tika Parse task,
# and log the extracted content.
id: parse-pdf
namespace: company.team

tasks:
  # Fetch the sample PDF; the next task references it via
  # `outputs.download_pdf.uri`.
  - id: download_pdf
    type: io.kestra.plugin.core.http.Download
    uri: https://huggingface.co/datasets/kestra/datasets/resolve/main/pdf/app_store.pdf

  # Parse the downloaded PDF.
  - id: parse_text
    type: io.kestra.plugin.tika.Parse
    from: "{{ outputs.download_pdf.uri }}"
    # TEXT is one of the content types listed on this page
    # (TEXT, XHTML, XHTML_NO_HEADER).
    contentType: TEXT
    # NOTE(review): presumably keeps the parsed result inline in the task
    # output so the log task below can read it - confirm in plugin docs.
    store: false

  # Log the `content` field of the parse task's result.
  - id: log_extracted_text
    type: io.kestra.plugin.core.log.Log
    message: "{{ outputs.parse_text.result.content }}"
Properties
Default: XHTML
Possible Values: TEXT, XHTML, XHTML_NO_HEADER
Default: false
Default: { "strategy": "NO_OCR" }
Definitions
enableImagePreprocessing: boolean | string
language: string
strategy: string
Default: NO_OCR
Possible Values: AUTO, NO_OCR, OCR_ONLY, OCR_AND_TEXT_EXTRACTION
Default: true
Definitions
content: string
embedded: object
SubType: string
metadata: object
Format: uri