
Environment settings
Code
# Import libraries
# Dataframe / numeric stack.
import numpy as np
import pandas as pd
import polars as pl
# Google Sheets client (uses the credentials built below).
import gspread
# SQL engines / drivers available to the notebook.
import duckdb
import sqlalchemy as db
import pyodbc
# Google auth + BigQuery client.
# NOTE(review): oauth2client is deprecated in favor of google-auth
# (google.oauth2), which is also imported below — consider consolidating.
from oauth2client.service_account import ServiceAccountCredentials
from google.oauth2 import service_account
from google.cloud import bigquery
# Fast SQL-to-dataframe loader.
import connectorx as cx
import warnings
# NOTE(review): blanket suppression also hides deprecation warnings
# (e.g. from oauth2client / pandas GBQ helpers) — consider narrowing.
warnings.filterwarnings('ignore')
Code
# OAuth scopes: Sheets + Drive are required by gspread; Analytics is
# read-only access for any GA queries.
scopes = ['https://www.googleapis.com/auth/spreadsheets',
'https://www.googleapis.com/auth/drive',
'https://www.googleapis.com/auth/analytics.readonly']
# Path to the service-account key file (read from the working directory).
api = 'creds.json'
# Connect to Google Sheets.
# Fix: build the credentials with google-auth instead of the deprecated,
# unmaintained oauth2client library; gspread >= 3.x accepts these directly.
gs_credentials = service_account.Credentials.from_service_account_file(
    api, scopes=scopes)
gc = gspread.authorize(gs_credentials)
# Connect to BigQuery with the same key (client applies its default scopes).
bq_credentials = service_account.Credentials.from_service_account_file(api)
project_id = 'repository'
client = bigquery.Client(credentials=bq_credentials, project=project_id)
Load Phase
There are two primary options for loading data:
1. Create a staging table and load the transformed data into it for temporary storage, so the data can be validated and, if necessary, modified before it reaches the target table.
2. Load the data directly into the target table, bypassing the staging step. This is simpler, but less flexible when complex transformations or validation are needed.
Code
# Create the target dataset for the load phase.
# Fix: exists_ok=True makes the call idempotent — without it, re-running
# the notebook raises google.api_core.exceptions.Conflict (409) once the
# dataset already exists.
client.create_dataset('database', exists_ok=True)
Dataset(DatasetReference('gepp-538', 'database'))
Code
# convert to pandas
# NOTE(review): `sheet` is created in an earlier cell not shown here —
# presumably a Polars frame, since it exposes .to_pandas(); confirm upstream.
sheet = sheet.to_pandas()
# upload to big query, replacing the table if it already exists
# NOTE(review): destination 'dw.transformation.catalog' has three dotted
# parts, but to_gbq expects 'dataset.table' with the project passed
# separately via project_id — verify this resolves as intended.
# NOTE(review): project_id here is 'repository-538', while the module-level
# variable defined above is 'repository' — confirm which one is correct.
sheet.to_gbq('dw.transformation.catalog',
project_id='repository-538',
if_exists='replace',
credentials=bq_credentials)
100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 7626.01it/s]
Execute queries from Big Query
Code
# Pull the full catalog table back out of BigQuery into a pandas
# DataFrame, authenticating with the service-account credentials
# built earlier in the notebook.
catalog = pd.read_gbq(
    '''
SELECT *
FROM `dw.transformation.catalog`
''',
    credentials=bq_credentials,
)