End-to-end SQL data cleaning and analysis using SSIS and SQL Server
View the Project on GitHub ZackPBI/SQL-Data-Analysis-Project
π Back to Portfolio
In this project, i went through SQL Server Integration Services (SSIS) to import data from CSV files into SQL Server Database, then i went through cleaning data and making it ready to be used for analytics. And then last, I explored the imported data with SQL for insights.
The dataset is from SBA Open Data

select *
into sba_naics_sector_codes_description
from(
SELECT
[NAICS_Industry_Description],
iif([NAICS_Industry_Description] like '%β%', substring([NAICS_Industry_Description], 8, 2 ), '') LookupCodes,
iif([NAICS_Industry_Description] like '%β%', ltrim(substring([NAICS_Industry_Description], CHARINDEX('β', [NAICS_Industry_Description]) + 1, LEN([NAICS_Industry_Description]) )), '') Sector
FROM [PortfolioDB].[dbo].[sba_industry_standards]
where [NAICS_Codes] = ''
) main
where
LookupCodes != ''
SELECT [NAICS_Industry_Description]
,[LookupCodes]
,[Sector]
FROM [PortfolioDB].[dbo].[sba_naics_sector_codes_description]
order by LookupCodes
insert into [dbo].[sba_naics_sector_codes_description]
values
('Sector 31 β 33 β Manufacturing', 32, 'Manufacturing'),
('Sector 31 β 33 β Manufacturing', 33, 'Manufacturing'),
('Sector 44 - 45 β Retail Trade', 45, 'Retail Trade'),
('Sector 48 - 49 β Transportation and Warehousing', 49, 'Transportation and Warehousing')
update [dbo].[sba_naics_sector_codes_description]
set Sector = 'Manufacturing'
where LookupCodes = 31
Select count(LoanNumber) as Loans_Approved
, sum(InitialApprovalAmount) Total_Net_Dollars
, AVG(InitialApprovalAmount) Average_Loan_Size,
(select count(distinct (OriginatingLender))from [dbo].[sba_public_data])Total_Originating_Lender_Count
from [dbo].[sba_public_data]
order by 3 desc
Select count(LoanNumber) as Loans_Approved
, sum(InitialApprovalAmount) Total_Net_Dollars
, AVG(InitialApprovalAmount) Average_Loan_Size,
(select count(distinct (OriginatingLender))from [dbo].[sba_public_data] where year(DateApproved) = 2021)Total_Originating_Lender_Count
from [dbo].[sba_public_data]
where year(DateApproved) = 2021
order by 3 desc
Select count(LoanNumber) as Loans_Approved
, sum(InitialApprovalAmount) Total_Net_Dollars
, AVG(InitialApprovalAmount) Average_Loan_Size,
(select count(distinct (OriginatingLender))from [dbo].[sba_public_data] where year(DateApproved) = 2020)Total_Originating_Lender_Count
from [dbo].[sba_public_data]
where year(DateApproved) = 2020
order by 3 desc
βData is ordered by Net_Dollars
Select top 15 OriginatingLender
, count(LoanNumber) as Loans_Approved
, sum(InitialApprovalAmount) Net_Dollars
, AVG(InitialApprovalAmount) Average_Loan_Size
from [dbo].[sba_public_data]
where year(DateApproved) = 2021
group by OriginatingLender
order by 3 desc
Select top 15 OriginatingLender
, count(LoanNumber) as Loans_Approved
, sum(InitialApprovalAmount) Net_Dollars
, AVG(InitialApprovalAmount) Average_Loan_Size
from [dbo].[sba_public_data]
where year(DateApproved) = 2020
group by OriginatingLender
order by 3 desc
with cte as (
select ncd.Sector
, count(LoanNumber) as Loans_Approved
, sum(CurrentApprovalAmount) Net_Dollars
from [dbo].[sba_public_data] main
inner join [dbo].[sba_naics_sector_codes_description] ncd
on left(cast(main.NAICSCode as varchar), 2) = ncd.LookupCode
where year(DateApproved) = 2021
group by ncd.Sector
--order by 3 desc
)
SELECT
sector,
Loans_Approved,
SUM(Net_Dollars) OVER(PARTITION BY sector) AS Net_Dollars,
--SUM(Net_Dollars) OVER() AS Total,
CAST(1. * Net_Dollars / SUM(Net_Dollars) OVER() AS DECIMAL(5,2)) * 100 AS "Percent by Amount"
FROM cte
order by 3 desc
--where year(DateApproved) = 2021
select BorrowerState as state
, count(LoanNumber) as Loan_Count
, sum(CurrentApprovalAmount) Net_Dollars
from [dbo].[sba_public_data] main
--where cast(DateApproved as date) < '2021-06-01'
group by BorrowerState
order by 1
select race
, count(LoanNumber) as Loan_Count
, sum(CurrentApprovalAmount) Net_Dollars
from [dbo].[sba_public_data]
group by race
order by 3
select gender
, count(LoanNumber) as Loan_Count
, sum(CurrentApprovalAmount) Net_Dollars
from [dbo].[sba_public_data]
group by gender
order by 3
select Ethnicity
, count(LoanNumber) as Loan_Count
, sum(CurrentApprovalAmount) Net_Dollars
from [dbo].[sba_public_data]
group by Ethnicity
order by 3
select Veteran, count(LoanNumber) as Loan_Count, sum(CurrentApprovalAmount) Net_Dollars
from [dbo].[sba_public_data]
group by Veteran
order by 3
select count(LoanNumber) as Count_of_Payments
, sum(ForgivenessAmount) Forgiveness_amount_paid
from sba_public_data
where year(DateApproved) = 2020 and ForgivenessAmount <> 0
Select count(LoanNumber) as Loans_Approved
, sum(InitialApprovalAmount) Total_Net_Dollars
, sum(ForgivenessAmount) Forgiveness_amount_paid,
(select count(distinct (OriginatingLender))from [dbo].[sba_public_data] where year(DateApproved) = 2021)Total_Originating_Lender_Count
from [dbo].[sba_public_data]
where year(DateApproved) = 2020
order by 3 desc
select Year(DateApproved) Year_Approved
, Month(DateApproved)Month_Approved
, ProcessingMethod, sum(CurrentApprovalAmount) Net_Dollars
from sba_public_data
group by Year(DateApproved), Month(DateApproved), ProcessingMethod
order by 4 desc