You might find it useful to extract only the text from a document. This is helpful
if you're passing the text to another API service. All the text in a document is
contained across its tabs, in the text runs of paragraph elements.
Extracting all the text in a document involves traversing the tabs tree
hierarchy and calling getter methods on Tab and DocumentTab. See Work with Tabs
for more information on the tabs feature.
Text can appear in three types of a document tab's structural elements:
- Paragraph
- Table of Contents
- Tables
Tables can be nested inside another table. Therefore, to extract all the text in a document, you must visit each nested structural element.
For a full description of the document body, see the Document Structure guide.
The following Google Docs API sample uses recursion to visit each structural element in all tabs of a document and prints the text.
Java
// Copyright 2019 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. import com.google.api.client.auth.oauth2.Credential ; import com.google.api.client.extensions.java6.auth.oauth2.AuthorizationCodeInstalledApp ; import com.google.api.client.extensions.jetty.auth.oauth2.LocalServerReceiver ; import com.google.api.client.googleapis.auth.oauth2.GoogleAuthorizationCodeFlow ; import com.google.api.client.googleapis.auth.oauth2.GoogleClientSecrets ; import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport ; import com.google.api.client.http.javanet.NetHttpTransport ; import com.google.api.client.json.JsonFactory ; import com.google.api.client.json.jackson2.JacksonFactory ; import com.google.api.client.util.store.FileDataStoreFactory ; import com.google.api.services.docs.v1.Docs ; import com.google.api.services.docs.v1.DocsScopes ; import com.google.api.services.docs.v1.model.Document ; import com.google.api.services.docs.v1.model.DocumentTab ; import com.google.api.services.docs.v1.model.ParagraphElement ; import com.google.api.services.docs.v1.model.StructuralElement ; import com.google.api.services.docs.v1.model.Tab ; import com.google.api.services.docs.v1.model.TableCell ; import com.google.api.services.docs.v1.model.TableRow ; import com.google.api.services.docs.v1.model.TextRun ; import java.io.IOException ; import java.io.InputStream ; import java.io.InputStreamReader ; import java.security.GeneralSecurityException ; import java.util.ArrayList ; 
import java.util.Collections ; import java.util.List ; public class ExtractText { private static final String APPLICATION_NAME = "Google Docs API Extract Guide" ; private static final JsonFactory JSON_FACTORY = JacksonFactory . getDefaultInstance (); private static final String TOKENS_DIRECTORY_PATH = "tokens" ; private static final String DOCUMENT_ID = "YOUR_DOCUMENT_ID" ; /** * Global instance of the scopes required by this quickstart. If modifying these scopes, delete * your previously saved tokens/ folder. */ private static final List<String> SCOPES = Collections . singletonList ( DocsScopes . DOCUMENTS_READONLY ); private static final String CREDENTIALS_FILE_PATH = "/credentials.json" ; /** * Creates an authorized Credential object. * * @param HTTP_TRANSPORT The network HTTP Transport. * @return An authorized Credential object. * @throws IOException If the credentials.json file cannot be found. */ private static Credential getCredentials ( final NetHttpTransport HTTP_TRANSPORT ) throws IOException { // Load client secrets. InputStream in = ExtractText . class . getResourceAsStream ( CREDENTIALS_FILE_PATH ); GoogleClientSecrets clientSecrets = GoogleClientSecrets . load ( JSON_FACTORY , new InputStreamReader ( in )); // Build flow and trigger user authorization request. GoogleAuthorizationCodeFlow flow = new GoogleAuthorizationCodeFlow . Builder ( HTTP_TRANSPORT , JSON_FACTORY , clientSecrets , SCOPES ) . setDataStoreFactory ( new FileDataStoreFactory ( new java . io . File ( TOKENS_DIRECTORY_PATH ))) . setAccessType ( "offline" ) . build (); LocalServerReceiver receiver = new LocalServerReceiver . Builder (). setPort ( 8888 ). build (); return new AuthorizationCodeInstalledApp ( flow , receiver ). authorize ( "user" ); } /** * Adds the provided tab to the list of all tabs, and recurses through and * adds all child tabs. */ private void addCurrentAndChildTabs ( Tab tab , List<Tab> allTabs ) { allTabs . add ( tab ); for ( Tab tab : tab . 
getChildTabs ()) { addCurrentAndChildTabs ( tab , allTabs ); } } /** * Returns a flat list of all tabs in the document in the order they would * appear in the UI (top-down ordering). Includes all child tabs. */ private List<Tab> getAllTabs ( Document doc ) { List<Tab> allTabs = new ArrayList <> (); // Iterate over all tabs and recursively add any child tabs to generate a // flat list of Tabs. for ( Tab tab : doc . getTabs ()) { addCurrentAndChildTabs ( tab , allTabs ); } return allTabs ; } /** * Returns the text in the given ParagraphElement. * * @param element a ParagraphElement from a Google Doc */ private static String readParagraphElement ( ParagraphElement element ) { TextRun run = element . getTextRun (); if ( run == null || run . getContent () == null ) { // The TextRun can be null if there is an inline object. return "" ; } return run . getContent (); } /** * Recurses through a list of Structural Elements to read a document's text where text may be in * nested elements. * * @param elements a list of Structural Elements */ private static String readStructuralElements ( List<StructuralElement> elements ) { StringBuilder sb = new StringBuilder (); for ( StructuralElement element : elements ) { if ( element . getParagraph () != null ) { for ( ParagraphElement paragraphElement : element . getParagraph (). getElements ()) { sb . append ( readParagraphElement ( paragraphElement )); } } else if ( element . getTable () != null ) { // The text in table cells are in nested Structural Elements and tables may be // nested. for ( TableRow row : element . getTable (). getTableRows ()) { for ( TableCell cell : row . getTableCells ()) { sb . append ( readStructuralElements ( cell . getContent ())); } } } else if ( element . getTableOfContents () != null ) { // The text in the TOC is also in a Structural Element. sb . append ( readStructuralElements ( element . getTableOfContents (). getContent ())); } } return sb . toString (); } public static void main ( String ... 
args ) throws IOException , GeneralSecurityException { // Build a new authorized API client service. final NetHttpTransport HTTP_TRANSPORT = GoogleNetHttpTransport . newTrustedTransport (); Docs service = new Docs . Builder ( HTTP_TRANSPORT , JSON_FACTORY , getCredentials ( HTTP_TRANSPORT )) . setApplicationName ( APPLICATION_NAME ) . build (); // Fetch the document with all of the tabs populated, including any nested // child tabs. Document doc = service . documents (). get ( DOCUMENT_ID ). setIncludeTabsContent ( true ). execute (); List<Tab> allTabs = getAllTabs ( doc ); // Print the text from each tab in the document. for ( Tab tab : allTabs ) { // Get the DocumentTab from the generic Tab. DocumentTab documentTab = tab . getDocumentTab (); System . out . println ( readStructuralElements ( documentTab . getBody (). getContent ())); } } }
Python
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Recursively extracts the text from a Google Doc.
"""
import googleapiclient.discovery as discovery
from httplib2 import Http
from oauth2client import client
from oauth2client import file
from oauth2client import tools

SCOPES = 'https://www.googleapis.com/auth/documents.readonly'
DISCOVERY_DOC = 'https://docs.googleapis.com/$discovery/rest?version=v1'
DOCUMENT_ID = 'YOUR_DOCUMENT_ID'


def get_credentials():
  """Gets valid user credentials from storage.

  If nothing has been stored, or if the stored credentials are invalid,
  the OAuth 2.0 flow is completed to obtain the new credentials.

  Returns:
      Credentials, the obtained credential.
  """
  store = file.Storage('token.json')
  credentials = store.get()

  if not credentials or credentials.invalid:
    flow = client.flow_from_clientsecrets('credentials.json', SCOPES)
    credentials = tools.run_flow(flow, store)
  return credentials


def add_current_and_child_tabs(tab, all_tabs):
  """Adds the provided tab to the list of all tabs, and recurses through and
  adds all child tabs.

  Args:
      tab: a Tab from a Google Doc.
      all_tabs: a list of all tabs in the document; modified in place.
  """
  all_tabs.append(tab)
  # A missing (or null) 'childTabs' key is treated as "no children"; iterating
  # the bare .get() result would raise TypeError on None.
  for child_tab in tab.get('childTabs') or []:
    add_current_and_child_tabs(child_tab, all_tabs)


def get_all_tabs(doc):
  """Returns a flat list of all tabs in the document in the order they would
  appear in the UI (top-down ordering). Includes all child tabs.

  Args:
      doc: a document.

  Returns:
      A list of tabs, each parent followed by its descendants; empty if the
      document has no 'tabs' entry.
  """
  all_tabs = []
  # Iterate over all tabs and recursively add any child tabs to generate a
  # flat list of Tabs.
  for tab in doc.get('tabs') or []:
    add_current_and_child_tabs(tab, all_tabs)
  return all_tabs


def read_paragraph_element(element):
  """Returns the text in the given ParagraphElement.

  Args:
      element: a ParagraphElement from a Google Doc.

  Returns:
      The text run's content, or '' when the element has no text run or the
      text run has no content.
  """
  text_run = element.get('textRun')
  if not text_run:
    return ''
  # Always return a string so callers can concatenate the result safely.
  return text_run.get('content') or ''


def read_structural_elements(elements):
  """Recurses through a list of Structural Elements to read a document's text
  where text may be in nested elements.

  Args:
      elements: a list of Structural Elements; None is treated as empty.

  Returns:
      The concatenated text of all paragraphs, table cells and tables of
      contents found in the elements.
  """
  parts = []
  for value in elements or []:
    if 'paragraph' in value:
      paragraph = value.get('paragraph') or {}
      for elem in paragraph.get('elements') or []:
        parts.append(read_paragraph_element(elem))
    elif 'table' in value:
      # The text in table cells are in nested Structural Elements and tables may
      # be nested.
      table = value.get('table') or {}
      for row in table.get('tableRows') or []:
        for cell in row.get('tableCells') or []:
          parts.append(read_structural_elements(cell.get('content')))
    elif 'tableOfContents' in value:
      # The text in the TOC is also in a Structural Element.
      toc = value.get('tableOfContents') or {}
      parts.append(read_structural_elements(toc.get('content')))
  return ''.join(parts)


def main():
  """Uses the Docs API to print out the text of a document."""
  credentials = get_credentials()
  http = credentials.authorize(Http())
  docs_service = discovery.build(
      'docs', 'v1', http=http, discoveryServiceUrl=DISCOVERY_DOC
  )

  # Fetch the document with all of the tabs populated, including any nested
  # child tabs. The keyword must match the API's parameter name
  # (includeTabsContent); the client rejects unknown keyword arguments.
  doc = (
      docs_service.documents()
      .get(documentId=DOCUMENT_ID, includeTabsContent=True)
      .execute()
  )
  all_tabs = get_all_tabs(doc)

  # Print the text from each tab in the document.
  for tab in all_tabs:
    # Get the DocumentTab from the generic Tab.
    document_tab = tab.get('documentTab') or {}
    doc_content = (document_tab.get('body') or {}).get('content')
    print(read_structural_elements(doc_content))


if __name__ == '__main__':
  main()