Python: Extract data from PDF

Convert PDF documents to text and scrape relevant data


In this post I will illustrate how to convert a PDF into a text document using Python and then extract relevant information from it. If you want to learn more about regular expressions and the relevant functions, I have specific posts addressing those topics. If you are seeking to apply this methodology to a Word document, there is a separate blog post addressing that as well. There is also a post comparing SAS regex functions to Python regex functions.







What are the key steps in extracting data from a PDF?


1) Convert the PDF to text. One point to consider: if the PDF was created from a JPG or PNG image, you might have to use OCR to read the characters, because the content is still an image even though the file is in PDF format.

I have used "PDFMINER" for this task.


2) Read the text document and get the relevant text, strings and information. I achieved this using regular expressions.


3) Loop through all the documents and get the relevant information.


4) Put this information into a data-frame / table for analysis.


 

First Step: Create a function to convert each document into a text:


from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
from io import StringIO

def convert_pdf_to_txt(path, password="", maxpages=0):
    """Extract the text of a PDF file using pdfminer.

    Args:
        path: Path of the PDF file to read.
        password: Forwarded to ``PDFPage.get_pages``; leave empty for
            files that are not password protected.
        maxpages: Forwarded to ``PDFPage.get_pages``; the default ``0`` is
            the value this function has always used (presumably "no page
            limit" -- confirm against the pdfminer documentation).

    Returns:
        The text written by the converter for all processed pages, as one
        string.

    Raises:
        OSError: If ``path`` cannot be opened.
        Any exception raised by pdfminer while parsing is propagated.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        # The context manager closes the file even when parsing fails.
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # An empty page-number set is passed, as before, so no page
            # filter is requested.
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=maxpages,
                password=password,
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before it is closed in the ``finally`` clause.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

I have simplified the above function, though one could also pass parameters such as a password if the file is password protected. The only input parameter required is the path to the PDF file which you want to convert into text and extract information from.


 

That function is of limited use unless you can process many individual documents in one run, get the relevant information and build a table out of it. To that end we need to put this function in a loop and process those documents.


The code below shows how to loop through multiple PDF documents and process them.


import json              
import re
import os
import pandas as pd
#import docx2txt
import subprocess
#IMPORT THE RELEVANT PACKAGES
# Prints the listing of the current working directory; the return code is
# ignored.
subprocess.call('dir', shell=True)


# Root folder that is searched (recursively) for PDF files.
filepath = r"A:\insert your path here for the folder where all the files are"

# Full paths of every PDF found under `filepath`. The list is filled by the
# walk below and is meant to be looped over afterwards.
document_list = []

# Empty data frame which is filled in after processing all documents.
df_new = pd.DataFrame()

# Column names for the data frame.
cnames = ['p_id', 'p_name']

# Walk the configured folder so that changing `filepath` is the only edit
# needed to point the script at another directory.
for path, subdirs, files in os.walk(filepath):
    for name in files:
        # Keep only PDF files (extension compared case-insensitively) and
        # skip names starting with '~$' (presumably temporary/lock files).
        is_pdf = os.path.splitext(name)[1].lower() == ".pdf"
        if is_pdf and not name.startswith('~$'):
            document_list.append(os.path.join(path, name))

 

Of course we now need to create a loop which will process each of these documents. Remember that I have created an empty data frame; the code below only adds two columns/variables. This skeletal code can, however, be expanded to extract more complex data using regular expressions.



# The second loop starts here: it goes through each document, uses the PDF
# convert function to get the text, and pulls the person's ID and name out
# of that text.

# Patterns are compiled once, outside the loop. Raw strings keep regex
# escapes such as \s and \d from being read as (invalid) string escapes.
ID_LABEL_RE = re.compile(r'licencee licence  number: (.*)\n\n')
ID_FOR_RE = re.compile(r'ID NUMBER for(.*)')
NAME_LABEL_RE = re.compile(r'Licensee name:(.*)\n\n')
NAME_FOR_RE = re.compile(r'License for(.*)\s\(?\d+?')
DIGITS_RE = re.compile(r'\d+')

for document_path in document_list:
    # Empty list that receives the scraped data for this document.
    data = []

    # Convert the PDF to text. Conversion is best effort: a document that
    # cannot be read is reported and skipped instead of stopping the run.
    try:
        document = convert_pdf_to_txt(document_path)
    except Exception as exc:
        print("Skipping {}: {}".format(document_path, exc))
        continue

    # In this example the file name starts with a personal ID followed by
    # an underscore, e.g. 123455_mydetails.pdf. Keep the part before the
    # first underscore as a fallback ID; a name without any underscore
    # carries no ID.
    name_parts = os.path.basename(document_path).split("_")
    pidn = name_parts[0] if len(name_parts) > 1 else None

    # Two patterns look for the person's ID inside the document text.
    result = ID_LABEL_RE.search(document)
    resulta = ID_FOR_RE.search(document)
    if result:
        # First pattern matched: the captured text is the ID.
        p_id = result.group(1).strip()
    elif resulta:
        # Second pattern matched: the ID is the first run of digits in the
        # matched text, so p_id stays a string like in the other branches.
        digits = DIGITS_RE.search(resulta.group(0))
        p_id = digits.group(0) if digits else pidn
    else:
        # Neither pattern matched: fall back to the ID from the file name.
        p_id = pidn

    # Now extract the licensee name.
    result1 = NAME_LABEL_RE.search(document)
    if result1:
        p_name = result1.group(1).strip()
    elif resulta:
        # No labelled name, but the "ID NUMBER for" line exists: prefer a
        # "License for <name> <digits>" match and otherwise use the text
        # that follows "ID NUMBER for".
        resultb = NAME_FOR_RE.search(document)
        p_name = (resultb or resulta).group(1).strip()
    else:
        # If all else fails the name is null.
        p_name = None

 

Now we are getting to something useful! So we converted the PDF to text and then extracted useful information. Now the last step is to store this information.


 # Append the extracted information to the list.
 data.append([p_id])
 data.append([p_name])
 # Convert the list to a data frame (one row per value) and transpose it
 # so that each value ends up in its own column.
 df_tr = pd.DataFrame(data).transpose()
 # Name the columns.
 df_tr.rename(columns={0: 'p_id', 1: 'p_name'}, inplace=True)
 # Add the staging data frame to the main data frame. pd.concat is used
 # because DataFrame.append no longer exists in pandas 2.0 and later.
 df_new = pd.concat([df_new, df_tr])

Complete code below

import json              
import re
import os
import pandas as pd
#import docx2txt
import subprocess
#IMPORT THE RELEVANT PACKAGES
# Prints the listing of the current working directory; the return code is
# ignored.
subprocess.call('dir', shell=True)


# Root folder that is searched (recursively) for PDF files.
filepath = r"A:\insert your path here for the folder where all the files are"

# Column names for the final data frame.
cnames = ['p_id', 'p_name']

# Patterns are compiled once. Raw strings keep regex escapes such as \s and
# \d from being read as (invalid) string escapes.
ID_LABEL_RE = re.compile(r'licencee licence  number: (.*)\n\n')
ID_FOR_RE = re.compile(r'ID NUMBER for(.*)')
NAME_LABEL_RE = re.compile(r'Licensee name:(.*)\n\n')
NAME_FOR_RE = re.compile(r'License for(.*)\s\(?\d+?')
DIGITS_RE = re.compile(r'\d+')


def find_pdf_files(root):
    """Return the full paths of all PDF files found under *root*.

    The extension is compared case-insensitively and files whose name
    starts with '~$' (presumably temporary/lock files) are skipped.
    """
    found = []
    for path, subdirs, files in os.walk(root):
        for name in files:
            is_pdf = os.path.splitext(name)[1].lower() == ".pdf"
            if is_pdf and not name.startswith('~$'):
                found.append(os.path.join(path, name))
    return found


def id_from_filename(document_path):
    """Return the file-name part before the first underscore, or None.

    In this example the file name starts with a personal ID, e.g.
    123455_mydetails.pdf gives '123455'. A name without any underscore
    carries no ID and gives None.
    """
    parts = os.path.basename(document_path).split("_")
    return parts[0] if len(parts) > 1 else None


def extract_person_id(document, fallback=None):
    """Return the person's ID found in *document*, else *fallback*.

    The labelled "licencee licence  number:" line wins; otherwise the first
    run of digits on the "ID NUMBER for" line is used.
    """
    labelled = ID_LABEL_RE.search(document)
    if labelled:
        return labelled.group(1).strip()
    id_line = ID_FOR_RE.search(document)
    if id_line:
        digits = DIGITS_RE.search(id_line.group(0))
        if digits:
            return digits.group(0)
    return fallback


def extract_licensee_name(document):
    """Return the licensee name found in *document*, or None.

    The labelled "Licensee name:" line wins. Otherwise, only when an
    "ID NUMBER for" line exists, a "License for <name> <digits>" match is
    preferred and the text after "ID NUMBER for" is the last resort.
    """
    labelled = NAME_LABEL_RE.search(document)
    if labelled:
        return labelled.group(1).strip()
    id_line = ID_FOR_RE.search(document)
    if id_line:
        licence_line = NAME_FOR_RE.search(document)
        return (licence_line or id_line).group(1).strip()
    return None


# First step: collect every PDF under the configured folder.
document_list = find_pdf_files(filepath)

# Second step: convert each PDF to text and scrape one [p_id, p_name] row.
rows = []
for document_path in document_list:
    # Conversion is best effort: a document that cannot be read is reported
    # and skipped instead of stopping the whole run.
    try:
        document = convert_pdf_to_txt(document_path)
    except Exception as exc:
        print("Skipping {}: {}".format(document_path, exc))
        continue
    p_id = extract_person_id(document, fallback=id_from_filename(document_path))
    p_name = extract_licensee_name(document)
    rows.append([p_id, p_name])

# Last step: build the data frame once from all rows. This avoids growing a
# frame inside the loop and does not rely on DataFrame.append, which no
# longer exists in pandas 2.0 and later.
df_new = pd.DataFrame(rows, columns=cnames)