Parse
Parse files with Apache Tika
Parse
Parse files with Apache Tika
yaml
type: io.kestra.plugin.tika.Parse
Examples
yaml
# Flow: parse a user-supplied file with Apache Tika, extracting embedded
# documents, then log the embedded content.
id: tika_parse_file
namespace: company.team

inputs:
  # File provided when the flow is executed.
  - id: file
    type: FILE

tasks:
  - id: parse
    type: io.kestra.plugin.tika.Parse
    from: "{{ inputs.file }}"
    extractEmbedded: true
    # store: false so the next task can read outputs.parse.result directly
    # (presumably store: true exposes the result via outputs.parse.uri
    # instead; confirm against the plugin documentation).
    store: false

  - id: log_embedded
    type: io.kestra.plugin.core.log.Log
    message: "{{ outputs.parse.result.embedded }}"
yaml
# Flow: parse a user-supplied file with Apache Tika using OCR together
# with regular text extraction.
id: tika_parse_image_ocr
namespace: company.team

inputs:
  # File provided when the flow is executed.
  - id: file
    type: FILE

tasks:
  - id: parse
    type: io.kestra.plugin.tika.Parse
    from: "{{ inputs.file }}"
    # strategy belongs to the ocrOptions mapping, not to the task itself.
    ocrOptions:
      strategy: OCR_AND_TEXT_EXTRACTION
    # store is a task-level property (sibling of ocrOptions).
    store: true
yaml
# Flow: download an image, parse it with Apache Tika (OCR plus text
# extraction, TEXT content type), and log the extracted metadata.
id: parse-image-metadata-using-apache-tika
namespace: company.team

tasks:
  - id: get_image
    type: io.kestra.plugin.core.http.Download
    uri: https://kestra.io/blogs/2023-05-31-beginner-guide-kestra.jpg

  - id: tika
    type: io.kestra.plugin.tika.Parse
    from: "{{ outputs.get_image.uri }}"
    # store: false so the log task can read outputs.tika.result directly.
    store: false
    contentType: TEXT
    ocrOptions:
      strategy: OCR_AND_TEXT_EXTRACTION

  - id: log_metadata
    type: io.kestra.plugin.core.log.Log
    message: "{{ outputs.tika.result.metadata }}"
yaml
# Flow: download a PDF, parse it with Apache Tika as plain text, and log
# the extracted content.
id: parse-pdf
namespace: company.team

tasks:
  - id: download_pdf
    type: io.kestra.plugin.core.http.Download
    uri: https://huggingface.co/datasets/kestra/datasets/resolve/main/pdf/app_store.pdf

  - id: parse_text
    type: io.kestra.plugin.tika.Parse
    from: "{{ outputs.download_pdf.uri }}"
    contentType: TEXT
    # store: false so the log task can read outputs.parse_text.result directly.
    store: false

  - id: log_extracted_text
    type: io.kestra.plugin.core.log.Log
    message: "{{ outputs.parse_text.result.content }}"
Properties
charactersLimit integer | string
contentType string
Default
XHTML
Possible Values
TEXT
XHTML
XHTML_NO_HEADER
extractEmbedded boolean | string
Default
false
from string
ocrOptions Non-dynamic
Default
{
  "strategy": "NO_OCR"
}
Definitions
io.kestra.plugin.tika.Parse-OcrOptions
enableImagePreprocessing boolean | string
language string
strategy string
Default
NO_OCR
Possible Values
AUTO
NO_OCR
OCR_ONLY
OCR_AND_TEXT_EXTRACTION
store boolean | string
Default
true
Outputs
result
Definitions
io.kestra.plugin.tika.Parse-Parsed
content string
embedded object
SubType string
metadata object
uri string
Format
uri