Load a processed document (or document shards) from Cloud Storage for post processing.
Explore further
For detailed documentation that includes this code sample, see the following:
Code sample
Python
For more information, see the Document AI Python API reference documentation.
To authenticate to Document AI, set up Application Default Credentials. For more information, see "Set up authentication for a local development environment".
from
typing
import
Optional
from
google.cloud
import
documentai
from
google.cloud.documentai_toolbox
import
document
,
gcs_utilities
# TODO(developer): Uncomment these variables before running the sample.
# Given a Document JSON or sharded Document JSON in path gs://bucket/path/to/folder
# gcs_bucket_name = "bucket"
# gcs_prefix = "path/to/folder"
# Or, given a Document JSON in path gs://bucket/path/to/folder/document.json
# gcs_uri = "gs://bucket/path/to/folder/document.json"
# Or, given a Document JSON in path local/path/to/folder/document.json
# document_path = "local/path/to/folder/document.json"
# Or, given a Document object from Document AI
# documentai_document = documentai.Document()
# Or, given a BatchProcessMetadata object from Document AI
# operation = client.batch_process_documents(request)
# operation.result(timeout=timeout)
# batch_process_metadata = documentai.BatchProcessMetadata(operation.metadata)
# Or, given a BatchProcessOperation name from Document AI
# batch_process_operation = "projects/project_id/locations/location/operations/operation_id"
def quickstart_sample(
    gcs_bucket_name: Optional[str] = None,
    gcs_prefix: Optional[str] = None,
    gcs_uri: Optional[str] = None,
    document_path: Optional[str] = None,
    documentai_document: Optional[documentai.Document] = None,
    batch_process_metadata: Optional[documentai.BatchProcessMetadata] = None,
    batch_process_operation: Optional[str] = None,
    location: str = "us",
) -> document.Document:
    """Load a processed document from one source, print its contents, and return it.

    Exactly one source is used: the first truthy one, checked in the order
    the parameters are listed below. Later sources are ignored once an
    earlier one matches.

    Args:
        gcs_bucket_name: Cloud Storage bucket holding the Document JSON
            (or sharded Document JSON). Only used together with
            ``gcs_prefix``.
        gcs_prefix: Folder prefix inside ``gcs_bucket_name``.
        gcs_uri: Full ``gs://`` URI of a single Document JSON file.
        document_path: Local path to a Document JSON file.
        documentai_document: An in-memory ``documentai.Document``.
        batch_process_metadata: Metadata of a finished batch process
            operation; only the first resulting document is used.
        batch_process_operation: Name of a batch process operation
            (``projects/.../locations/.../operations/...``); only the first
            resulting document is used.
        location: Location passed to ``from_batch_process_operation``.
            Defaults to ``"us"``, the value this sample previously
            hard-coded. Ignored for every other source.

    Returns:
        The wrapped toolbox ``Document`` that was loaded.

    Raises:
        ValueError: If none of the sources is provided.
        IndexError: If a batch source yields no documents (the first
            element is taken unconditionally).
    """
    if gcs_bucket_name and gcs_prefix:
        # Load from Google Cloud Storage Directory
        print("Document structure in Cloud Storage")
        gcs_utilities.print_gcs_document_tree(
            gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
        )

        wrapped_document = document.Document.from_gcs(
            gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
        )
    elif gcs_uri:
        # Load a single Document from a Google Cloud Storage URI
        wrapped_document = document.Document.from_gcs_uri(gcs_uri=gcs_uri)
    elif document_path:
        # Load from local `Document` JSON file
        wrapped_document = document.Document.from_document_path(document_path)
    elif documentai_document:
        # Load from `documentai.Document` object
        wrapped_document = document.Document.from_documentai_document(
            documentai_document
        )
    elif batch_process_metadata:
        # Load Documents from `BatchProcessMetadata` object
        wrapped_documents = document.Document.from_batch_process_metadata(
            metadata=batch_process_metadata
        )
        # A batch can produce several documents; this sample shows the first.
        wrapped_document = wrapped_documents[0]
    elif batch_process_operation:
        # Load Documents from a batch process operation name
        wrapped_documents = document.Document.from_batch_process_operation(
            location=location, operation_name=batch_process_operation
        )
        # A batch can produce several documents; this sample shows the first.
        wrapped_document = wrapped_documents[0]
    else:
        raise ValueError("No document source provided.")

    # For all properties and methods, refer to:
    # https://cloud.google.com/python/docs/reference/documentai-toolbox/latest/google.cloud.documentai_toolbox.wrappers.document.Document

    print("Document Successfully Loaded!")
    print(f"\tNumber of Pages: {len(wrapped_document.pages)}")
    print(f"\tNumber of Entities: {len(wrapped_document.entities)}")

    for page in wrapped_document.pages:
        print(f"Page {page.page_number}")
        for block in page.blocks:
            print(block.text)
        for paragraph in page.paragraphs:
            print(paragraph.text)
        for line in page.lines:
            print(line.text)
        for token in page.tokens:
            print(token.text)

        # Only supported with Form Parser processor
        # https://cloud.google.com/document-ai/docs/form-parser
        for form_field in page.form_fields:
            print(f"{form_field.field_name} : {form_field.field_value}")

        # Only supported with Enterprise Document OCR version `pretrained-ocr-v2.0-2023-06-02`
        # https://cloud.google.com/document-ai/docs/process-documents-ocr#enable_symbols
        for symbol in page.symbols:
            print(symbol.text)

        # Only supported with Enterprise Document OCR version `pretrained-ocr-v2.0-2023-06-02`
        # https://cloud.google.com/document-ai/docs/process-documents-ocr#math_ocr
        for math_formula in page.math_formulas:
            print(math_formula.text)

    # Only supported with Entity Extraction processors
    # https://cloud.google.com/document-ai/docs/processors-list
    for entity in wrapped_document.entities:
        print(f"{entity.type_} : {entity.mention_text}")
        if entity.normalized_text:
            print(f"\tNormalized Text: {entity.normalized_text}")

    # Only supported with Layout Parser
    for chunk in wrapped_document.chunks:
        print(f"Chunk {chunk.chunk_id}: {chunk.content}")

    for block in wrapped_document.document_layout_blocks:
        print(f"Document Layout Block {block.block_id}")

        if block.text_block:
            print(f"{block.text_block.type_}: {block.text_block.text}")
        if block.list_block:
            print(f"{block.list_block.type_}: {block.list_block.list_entries}")
        if block.table_block:
            print(block.table_block.header_rows, block.table_block.body_rows)

    # The signature promises a wrapped Document; hand it back so callers can
    # keep post-processing instead of receiving None.
    return wrapped_document
What's next
To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.