# Source code for cvfe.api.convert.adobe_xfa

# Ours: data
from cvfe.data import functional
from cvfe.data.preprocessor import (
    CopyFile,
    FileTransformCompose,
    MakeContentCopyProtectedMachineReadable,
    CanadaDataframePreprocessor)
from cvfe.data.constant import DocTypes
# Ours: API
from cvfe.api.convert import BASE_SOURCE_DIR
# API
import fastapi
import requests
from fastapi.encoders import jsonable_encoder
# helpers
from typing import Optional
from pathlib import Path
import pandas as pd
import logging
import sys
import os


# Module-level logger, named after this module (``__name__``), so log records
# emitted below can be attributed to this file.
logger = logging.getLogger(__name__)


# FastAPI router to be used by the FastAPI app.
# Every route registered on it is served under the
# '/cvfe/v1/convert/adobe_xfa' prefix and tagged 'adobe_xfa'.
router = fastapi.APIRouter(
    prefix='/cvfe/v1/convert/adobe_xfa',
    tags=['adobe_xfa']
)


def _find_form(files, form_id):
    """Return the first path in ``files`` whose file name contains ``form_id``, else ``None``."""
    return next(
        (path for path in files if form_id in os.path.basename(path)),
        None)


def process(src_dir: Path) -> pd.DataFrame:
    """Make the PDFs under ``src_dir`` machine readable and load them as one dataframe.

    The function works in two stages:

    1. Every file under ``src_dir`` is run through a ``FileTransformCompose``:
       ``.csv`` and ``.txt`` files go through ``CopyFile(mode='cf')`` and
       ``.pdf`` files go through ``MakeContentCopyProtectedMachineReadable``.
       The output is written to ``<first component of src_dir>/decrypted/``.
    2. The output directory is walked; in each directory that contains files,
       the 5257 form and the 5645 form are parsed with
       ``CanadaDataframePreprocessor`` and concatenated column-wise.

    Args:
        src_dir: directory containing the uploaded forms.

    Returns:
        The column-wise concatenation of the 5257 and 5645 dataframes of the
        last directory visited, or an empty dataframe if no directory
        contained any file.

    Raises:
        FileNotFoundError: if a directory that contains files is missing the
            5257 or the 5645 form.
    """

    # path to the output decrypted pdf
    # (``str / Path`` is resolved by ``Path.__rtruediv__``)
    dst_dir: Path = src_dir.parts[0] / Path('decrypted/')

    # main code
    logger.info('↓↓↓ Starting data extraction ↓↓↓')
    # Canada protected PDF to make machine readable and skip other files
    compose = {
        CopyFile(mode='cf'): '.csv',
        CopyFile(mode='cf'): '.txt',
        MakeContentCopyProtectedMachineReadable(): '.pdf'
    }
    file_transform_compose = FileTransformCompose(transforms=compose)
    functional.process_directory(
        src_dir=src_dir.as_posix(),
        dst_dir=dst_dir.as_posix(),
        compose=file_transform_compose,
        file_pattern='*')
    logger.info('↑↑↑ Finished data extraction ↑↑↑')

    logger.info('↓↓↓ Starting data loading ↓↓↓')
    # convert PDFs to pandas dataframes; a separate local is used so the
    # ``src_dir`` parameter keeps its ``Path`` type
    decrypted_root = dst_dir.as_posix()
    dataframe = pd.DataFrame()
    for dirpath, _, filenames in os.walk(decrypted_root):
        if not filenames:
            # directories without files hold no data point
            continue
        files = [os.path.join(dirpath, fname) for fname in filenames]
        df_preprocessor = CanadaDataframePreprocessor()

        # applicant form
        logger.info('↓↓↓ Starting to process 5257E ↓↓↓')
        # match on the file name only, so a directory whose name happens to
        # contain the form id cannot produce a false match
        in_fname = _find_form(files, '5257')
        if in_fname is None:
            raise FileNotFoundError(
                f"No file matching form '5257' found in '{dirpath}'")
        dataframe_applicant = df_preprocessor.file_specific_basic_transform(
            path=in_fname, type=DocTypes.canada_5257e)
        logger.info('↑↑↑ Finished processing 5257E ↑↑↑')

        # applicant family info
        logger.info('↓↓↓ Starting to process 5645E ↓↓↓')
        in_fname = _find_form(files, '5645')
        if in_fname is None:
            raise FileNotFoundError(
                f"No file matching form '5645' found in '{dirpath}'")
        dataframe_family = df_preprocessor.file_specific_basic_transform(
            path=in_fname, type=DocTypes.canada_5645e)
        logger.info('↑↑↑ Finished processing 5645E ↑↑↑')

        # final dataframe: concatenate common forms and label column wise
        dataframe = pd.concat(
            objs=[dataframe_applicant, dataframe_family],
            axis=1,
            verify_integrity=True)

        # logging
        logger.info('Processed the data point')
    logger.info('↑↑↑ Finished data loading ↑↑↑')
    return dataframe
@router.post(
    '/',
    status_code=fastapi.status.HTTP_200_OK,
    tags=['adobe_xfa'])
async def convert(
    form_5257: fastapi.UploadFile,
    form_5645: fastapi.UploadFile,
    post_url: Optional[str] = None
):
    """Convert an uploaded 5257 / 5645 form pair into a flat record.

    The uploads are written to ``BASE_SOURCE_DIR/x/`` as ``5257.pdf`` and
    ``5645.pdf``, ``process`` is run on ``BASE_SOURCE_DIR``, and the first row
    of the resulting dataframe is returned as a one-element list of dicts.
    If ``post_url`` is given, the same payload is also POSTed there as JSON.

    Args:
        form_5257: uploaded 5257 form.
        form_5645: uploaded 5645 form.
        post_url: optional third-party URL that receives the result.

    Returns:
        A list holding one dict: the first row of the processed dataframe.

    Raises:
        fastapi.HTTPException:
            - 415 if the uploads cannot be saved to disk,
            - 400 if processing the saved files fails,
            - 502 if the request to ``post_url`` cannot be completed,
            - the third party's own status code (with its response text as
              detail) if it answers with a non-OK status.
    """

    # save files to disk
    # NOTE(review): the target file names are fixed, so concurrent requests
    # write to the same two files - confirm this endpoint is not expected to
    # serve overlapping requests.
    try:
        input_path: Path = BASE_SOURCE_DIR / Path('x/')
        # create the path if does not exist
        input_path.mkdir(parents=True, exist_ok=True)
        with open(input_path / Path('5257.pdf'), 'wb') as f:
            f.write(await form_5257.read())
        with open(input_path / Path('5645.pdf'), 'wb') as f:
            f.write(await form_5645.read())
    except Exception as error:
        logger.exception(error)
        raise fastapi.HTTPException(
            status_code=fastapi.status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail=str(error)) from error

    # extract the data from the saved forms
    try:
        data: pd.DataFrame = process(src_dir=BASE_SOURCE_DIR)
        logger.info('Process finished')
        response = [data.iloc[0].to_dict()]
    except Exception as error:
        logger.exception(error)
        raise fastapi.HTTPException(
            status_code=fastapi.status.HTTP_400_BAD_REQUEST,
            detail=str(error)) from error

    # if third-party url is provided, send post request to that
    if post_url:
        # make response jsonable
        jsonable_response = jsonable_encoder(response)
        # NOTE(review): ``post_url`` comes straight from the client and
        # ``requests.post`` is a blocking call inside an ``async`` handler;
        # consider restricting allowed hosts and moving the call off the
        # event loop.
        try:
            # send the response to create the item in DB
            post_response = requests.post(
                url=post_url,
                json=jsonable_response
            )
        except requests.RequestException as error:
            # no HTTP status exists when the request itself fails, so report
            # a gateway error instead of an invalid status code
            logger.exception(error)
            raise fastapi.HTTPException(
                status_code=fastapi.status.HTTP_502_BAD_GATEWAY,
                detail=str(error)) from error

        logger.info(f'post response code {post_response.status_code}')
        # raise exception if bad status code, keeping the third party's
        # status code and response body intact
        if not post_response.ok:
            logger.error(post_response.text)
            raise fastapi.HTTPException(
                status_code=post_response.status_code,
                detail=post_response.text)

    return response