Difference between revisions of "Retrieving US VC Data From SDC"

From edegan.com
Jump to navigation Jump to search
 
(37 intermediate revisions by the same user not shown)
Line 270: Line 270:
 
  VCFunds1980-present.rpt                                                   
 
  VCFunds1980-present.rpt                                                   
  
Created: 6/16/2017 3:28:23 PM
+
Created: 9/28/2017 3:16:59 PM
  
 
Session Contents:
 
Session Contents:
Line 333: Line 333:
 
===VC Firms===
 
===VC Firms===
  
Core Files:
+
Core Files:                                                                        
  USVCFirmsOnly1980-present.rpt                                                   
+
  USVCFirms1980-Present.rpt                                                   
  USVCFirmsOnly1980-present.ssh                                                   
+
  USVCFirms1980-Present.ssh                                                   
  USVCFirmsOnly1980-present.txt                                                   
+
  USVCFirms1980-Present.txt                                                   
  USVCFirmsOnly-present-normal.txt                                             
+
  USVCFirms1980-Present-normal.txt                                             
  
 
Created: 9/29/2017 3:46:06 PM
 
Created: 9/29/2017 3:46:06 PM
Line 349: Line 349:
 
Custom Report Contents:
 
Custom Report Contents:
 
  FIRM_NAME
 
  FIRM_NAME
 +
VI_YEARFOUND
 
  FIRM_FIRST_INV_DATE
 
  FIRM_FIRST_INV_DATE
 
  FIRM_LAST_INV_DATE
 
  FIRM_LAST_INV_DATE
VI_YEARFOUND
 
 
  VI_STREET1
 
  VI_STREET1
 
  VI_STREET2
 
  VI_STREET2
 
  VI_STREET3
 
  VI_STREET3
 +
VI_CITY
 +
VI_ZIP
 +
FIRM_AREA_CODE
 +
FIRM_COUNTY
 +
VI_STATE
 +
VI_NATION
 
  VI_NATION_LONG
 
  VI_NATION_LONG
  VI_NATION
+
  FIRM_WREG1_DESC
 
  FIRM_NUM_COMPANIES
 
  FIRM_NUM_COMPANIES
 
  FIRM_NUM_ROUNDS
 
  FIRM_NUM_ROUNDS
 
  FIRM_TOTIN
 
  FIRM_TOTIN
FIRM_AREA_CODE
 
 
  FIRM_CAP_MGT
 
  FIRM_CAP_MGT
VI_CITY
 
FIRM_COUNTY
 
 
  FIRM_STATUS_DESC
 
  FIRM_STATUS_DESC
  FIRM_MSA_DESC
+
  FIRM_MSA_CODE
 
  FIRM_ROLE_DESC
 
  FIRM_ROLE_DESC
 +
FIRM_GEOGRAPHY_PREF_DESC
 +
FIRM_INDUSTRY_PREF_DESC
 
  FIRM_STAGE_PREF_DESC
 
  FIRM_STAGE_PREF_DESC
VI_STATE
 
 
  FIRM_TYPE_DESC
 
  FIRM_TYPE_DESC
FIRM_WREG1_DESC
 
VI_ZIP
 
  
LoadFirms.sql
+
LoadFirms2.sql
DROP TABLE firmbase;
 
CREATE TABLE firmbase (
 
  firmname varchar(255),
 
  foundingdate date, --mm-dd-yyyy
 
  location varchar(100),
 
  nation varchar(100),
 
  nationcode varchar(10),
 
  datelastinv date, --mm-dd-yyyy
 
  datefirstinv date, --mm-dd-yyyy
 
  numportcos integer,
 
  numrounds integer,
 
  investedk money,
 
  areacode integer,
 
  bocity varchar(100),
 
  bonation varchar(100),
 
  bonationcode varchar(100),
 
  bostatecode varchar(100),
 
  boaddr1 varchar(100),
 
  boaddr2 varchar(100),
 
  bozip varchar(10),
 
  capitalundermgmt money,
 
  city varchar(100),
 
  county varchar(100),
 
  invstatus varchar(100),
 
  msacode varchar(10),
 
  geogpref varchar(100),
 
  indpref varchar(100),
 
  rolepref varchar(100),
 
  stagepref varchar(100),
 
  statecode varchar(2),
 
  addr1 varchar(100),
 
  addr2 varchar(100),
 
  type varchar(100),
 
  worldregion varchar(100),
 
  zip varchar(10)
 
);
 
 
\COPY firmbase FROM 'VCFirms1980-present-normal.txt' WITH DELIMITER AS E'\t' HEADER NULL AS '' CSV
 
  
LoadFirms.sql
 
 
  DROP TABLE firmbase;
 
  DROP TABLE firmbase;
  CREATE TABLE firmbase (
+
  CREATE TABLE firmbase(
  firmname varchar(255),
+
  firmname varchar(255),
  foundingdate date, --mm-dd-yyyy
+
  foundingdate date, --mm-dd-yyyy
  location varchar(100),
+
  datefirstinv date, --mm-dd-yyyy
  nation varchar(100),
+
  datelastinv date, --mm-dd-yyyy
  nationcode varchar(10),
+
  addr1 varchar(100),
  datelastinv date, --mm-dd-yyyy
+
  addr2 varchar(100),
  datefirstinv date, --mm-dd-yyyy
+
  location varchar(100),
  numportcos integer,
+
  city varchar(100),
  numrounds integer,
+
  zip varchar(10),
  investedk money,
+
  areacode integer,
  areacode integer,
+
  county varchar(100),
  bocity varchar(100),
+
  state varchar(2),
  bonation varchar(100),
+
  nationcode varchar(10),
  bonationcode varchar(100),
+
  nation varchar(100),
  bostatecode varchar(100),
+
  worldregion varchar(100),
  boaddr1 varchar(100),
+
  numportcos integer,
  boaddr2 varchar(100),
+
  numrounds integer,
  bozip varchar(10),
+
  investedk money,
  capitalundermgmt money,
+
  capitalundermgmt money,
  city varchar(100),
+
  invstatus varchar(100),
  county varchar(100),
+
  msacode varchar(10),
  invstatus varchar(100),
+
  rolepref varchar(100),
  msacode varchar(10),
+
  geogpref varchar(100),
  geogpref varchar(100),
+
  indpref varchar(100),
  indpref varchar(100),
+
  stagepref varchar(100),
  rolepref varchar(100),
+
  type varchar(100)
  stagepref varchar(100),
+
 
  statecode varchar(2),
 
  addr1 varchar(100),
 
  addr2 varchar(100),
 
  type varchar(100),
 
  worldregion varchar(100),
 
  zip varchar(10)
 
 
  );
 
  );
 
   
 
   
  \COPY firmbase FROM 'VCFirms1980-present-normal.txt' WITH DELIMITER AS E'\t' HEADER NULL AS '' CSV
+
  \COPY firmbase FROM 'USVCFirms1980-Present-normal.txt' WITH DELIMITER AS E'\t' HEADER NULL AS '' CSV
 +
--9734
  
 
===Branch Offices===
 
===Branch Offices===
 
Core Files
 
Core Files
  USVCFirmBranchOffices.rpt                                                   
+
  USVCFirmBranchOffices1980-Present.rpt                                                   
  USVCFirmBranchOffices.ssh                                                   
+
  USVCFirmBranchOffices1980-Present.ssh                                                   
  USVCFirmBranchOffices.txt                                                   
+
  USVCFirmBranchOffices1980-Present.txt                                                   
  USVCFirmBranchOffices-normal.txt   
+
  USVCFirmBranchOffices1980-Present-normal.txt   
  
 
Created: 9/29/2017 3:54:32 PM
 
Created: 9/29/2017 3:54:32 PM
Line 471: Line 429:
 
  FIRM_NAME
 
  FIRM_NAME
 
  VI_CAPITAL_CITY
 
  VI_CAPITAL_CITY
  VI_CAPITAL_STATELONG
+
  VI_CAPITAL_STATE
 
  VI_CAPITAL_COUNTRY
 
  VI_CAPITAL_COUNTRY
 
  VI_CAPITAL_NATION
 
  VI_CAPITAL_NATION
Line 477: Line 435:
 
  VI_ADR_LINE1
 
  VI_ADR_LINE1
 
  VI_ADR_LINE2
 
  VI_ADR_LINE2
 +
 +
LoadBranchOffices.sql
 +
DROP TABLE branchoffices;
 +
CREATE TABLE branchoffices (
 +
  firmname varchar(255),
 +
  bocity varchar(100),
 +
  bostate varchar(2),
 +
  bocountrycode varchar(2),
 +
  bonation varchar(100),
 +
  bozip varchar(10),
 +
  boaddr1 varchar(100),
 +
  boaddr2 varchar(100)
 +
 
 +
);
 +
\COPY branchoffices FROM 'USVCFirmBranchOffices1980-Present-normal.txt' WITH DELIMITER AS E'\t' HEADER NULL AS  CSV
 +
--9734
  
 
==Company Long Description==
 
==Company Long Description==

Latest revision as of 16:42, 24 October 2017

Retrieving US VC Data From SDC requires SDC platinum, perl scripts and SQL scripts, as well as some manual processing.

Scripts and other info

SDC and Perl Scripts are in:

E:\McNair\Projects\VC Database

This includes:

  • NormalizeFixedWidth.pl
  • RoundOnOneLine.pl - Needed for round on one line

SQL Scripts and finished data are in:

Z:\VentureCapitalData\SDCVCData

Notes:

  • Portfolio companies attributes to be extracted from roundlevel information
  • Round-on-one-line processed using RoundOnOneLine.pl and NormalizeFixedWidth.pl
  • Firms includes branch office so attributes must be extracted
  • Portfolio company descriptions - just the portco name, state, date of first inv, and the long description - has to be custom processed.

The next steps are detailed in VC Database Rebuild.


Some files required some minor post-processing to load into PostgreSQL. Issues included:

  • Firm level data didn't normalize correctly - had to adjust headers
  • Stray quotation in address line(s)
  • Area code had a 1- in it
  • Some line counts were off by one or two
  • "Firm Capital under Mgmt" column header for VCFirms has a {0mil} which screws up the normalizer. Delete this part of the column title prior to running normalizer and make sure to put in the proper number of spaces.
  • VCFirms line 7139 the text 'L"opera' has a stray quotation mark which will prevent the copy into psql table. Remove stray quotation manually.
  • VCFirms line 12461 the text '1-8' has a hyphen which creates and import error into psql table. Remove the hyphen manually.

The company long description output from SDC has the description for each company spread over multiple lines. In order to import into the vcdb we want the entire description on one line for each company. First remove the header from the long description file and name this file Process.txt. Next, to move the company long description to one line use the following bash commands in a linux terminal like on the db server.

cat Process.txt | perl -pe 's/^([^ ])/#\1/g' > Out1.txt
cat Out1.txt | perl -pe 's/\s{100,}/ /g' > Out2.txt
cat Out2.txt | perl -pe 's/\n//g' > Out3.txt
cat Out3.txt | perl -pe 's/#/\n/g' > Out4.txt

You will still have an issue running the normalizer on Out4.txt. I appended 100,000 filler characters to the last column header so that the normalizer will space things properly. However, the final output will have some long descriptions split across two lines. This will cause problems copying to a psql table. You will have to manually adjust these lines. Also remove all quotation marks from the text file because these will just cause trouble.

Build Specs

IPOs

Created: 6/19/2017 2:59:33 PM

Core files:

IPO1980-present.rpt                                                       
IPO1980-present.ssh                                                       
IPO1980-present.txt                                                       
IPO1980-present-normal.txt                                                

Session Contents:

Global New Issues Databases
Issue Date: 1/1/1980 to 06/19/2017 (Custom) (Calendar)
Issuer/Borrower Nation : US
Select All IPOs
Primary Exchange of Issuer's/Borrower's Stock : A, NM, N

Report Contents:

D
I
STC
AMT
RANK1_TOTDOLAMTPRO
NAICS_PRIMARY_CODE
ZIPL
TRANS_STATUS
FOUNDED

LoadIPOs.sql

DROP TABLE ipos;
CREATE TABLE ipos (
  issuedate date, --mm-dd-yy
  issuer varchar(255),
  statecode varchar(10), 
  principalamt money, --million
  proceedsamt money, --sum of all markets in million
  naiccode varchar(255), --primary NAIC code
  zipcode varchar(10),
  status varchar (20),
  foundeddate date
);

\COPY ipos FROM 'IPO1980-present-normal.txt' WITH DELIMITER AS E'\t' HEADER NULL AS  CSV
--10440

M&As

Created: 6/19/2017 4:16:59 PM

Core files:

MA1980-present.rpt                                                        
MA1980-present.ssh                                                        
MA1980-present.txt                                                        
MA1980-present-normal.txt                                                 

Session Contents:

DATABASES: Domestic Mergers, 1979-Present (MA, OMA)
Date Announced: 1/1/1980 to 06/19/2017 (Custom) (Calendar)
Target Public Status : V
Percent of Shares Owned after Transaction: 100 to 100
Target Nation : US

Custom Report Contents:

TN
DE
TST
AN
ENTVAL
ASTC
TSTC
ATIC
AEXCH
TNAICP
DA
VAL
YEAREFF
TSTR
TCITY
TZIP

LoadMAs.sql

DROP TABLE mas;
CREATE TABLE mas (
  targetname varchar(255),
  effectivedate date,
  targetstate varchar(255),
  acquirorname varchar(255),
  enterpriseval varchar(255), --millions
  acquirorstatecode varchar(10),
  targetstatecode varchar(10),
  acquirorticker varchar(10),
  acquirorexchange varchar(20),
  targetnaiccode varchar(10),
  announceddate date, --mm-dd-yy
  transactionamt money, --$millions
  completedyear varchar(4), --yyyy
  targetaddress varchar(255),
  targetcity varchar(100),
  targetzipcode varchar(10)
);

\COPY mas FROM 'MA1980-present-normal.txt' WITH DELIMITER AS E'\t' HEADER NULL AS  CSV
--139013

Portfolio Companies & Rounds

Core files:

USVC1980-present-normal.txt                                               
USVC1980-present.rpt                                                      
USVC1980-present.ssh                                                      
USVC1980-present.txt                                                      

Created: 6/15/2017 5:38:16 PM

Session Contents:

DATABASE: Portfolio Companies (VIPC)
Venture Related Deals: Select All Venture Related Deals
Company Nation : US
Round Date: 1/1/1980 to 06/15/2017 (Custom) (Calendar)

Custom Report Contents:

N
RD
VI_UPDATE_STAMP
VI_YEARFOUND
LAST_INV_DATE
FIRST_INV_DATE
TOTIN
VI_CITY
VI_BUSDESC_SHORT
COMPANY_MSA_DESC
COMPANY_MSA_CODE
VI_NATION
VI_STATE
VI_STREET1
VI_STREET2
COMPANY_VEIC1_DESC
COMPANY_VEIC6_DESC
COMPANY_VEIC3_DESC
WEBSITE
VI_ZIP
STAGE1_DESC
STAGE3_DESC
VI_XC_ROUND_AMOUNT
VI_XC_TOT_KNOWN_AMT
VI_FINNO
VI_INVNOS

LoadRoundBase.sql

DROP TABLE roundbase;
CREATE TABLE roundbase (
  coname varchar(255),
  rounddate date,
  updateddate date,
  foundingdate date,
  datelastinv date,
  datefirstinv date,
  investedk real,
  city varchar(100),
  description varchar(5000),
  msa varchar(100),
  msacode varchar(10),
  nationcode varchar(10),
  statecode varchar(10),
  addr1 varchar(100),
  addr2 varchar(100),
  indclass varchar(100),
  indsubgroup3 varchar(100),
  indminor varchar(100),
  url varchar(5000),
  zip varchar(10),
  stage1 varchar(100),
  stage3 varchar(100),
  rndamtdisck real,
  rndamtestk real,
  roundnum integer,
  numinvestors integer
);

\COPY roundbase FROM 'USVC1980-present-normal.txt' WITH DELIMITER AS E'\t' HEADER NULL AS  CSV
--143348

Round On One Line

Core files:

USVCRound1980-present-NoHead-normal.txt                                   
USVCRound1980-present-NoHead.txt                                          
USVCRound1980-present-normal.txt                                          
USVCRound1980-present-sessiondetails.txt                                  
USVCRound1980-present.rpt                                                 
USVCRound1980-present.ssh                                                 
USVCRound1980-present.txt   

Created: 6/16/2017 4:39:47 PM

Session Contents:

DATABASE: Portfolio Companies (VIPC)
Venture Related Deals: Select All Venture Related Deals
Company Nation : US
Round Date: 1/1/1980 to 06/16/2017 (Custom) (Calendar)

Custom Report Contents:

N
VI_STATE
LAST_INV_DATE
FIRST_INV_DATE
COMBINED_ROUND_INFO

LoadRound.sql

DROP TABLE roundline;
CREATE TABLE roundline (
  coname varchar(255),
  rounddate date,
  amountk real,
  fundname varchar(255)
);

\COPY roundline FROM 'USVCRound1980-present-NoHead-normal.txt' WITH DELIMITER AS E'\t' HEADER NULL AS  CSV
--385753

VC Funds

Core files:

USVCFunds1980-present.ssh    
VCFund1980-present-normal-NoFoot.txt                                                   
VCFund1980-present-normal.txt                                             
VCFund1980-present-NoFoot.txt
VCFund1980-present.txt                                                    
VCFunds1980-present.rpt                                                   

Created: 9/28/2017 3:16:59 PM

Session Contents:

DATABASE: Private Equity Funds (VIFD)
Venture Related Deals: Select All Venture Related Deals
Fund Portfolio Cos: 1/1/1980 to 06/16/2017 (Custom) (Calendar)

Custom Report Contents:

VI_DISTINCT_FUND_NAME
VI_FIRST_CLOSE_DATE
FUND_LAST_INV_DATE
FUND_FIRST_INV_DATE
FUND_NUM_COMPANIES
FUND_TOTIN
VI_DISTINCT_FUND_CITY
FUND_YEAR
VI_DISTINCT_FUND_ZIP
VI_DISTINCT_FUND_STC
VI_FUND_SIZE
VCPI_STAGE_DESC
FIRMS
INVEST_TYPE_DESC
FUND_MSA_CODE
VI_DISTINCT_FUND_NATC
FUND_RAISE_STATUS_DESC
VCPI_SEQ_NUM
VCPI_SEQ_DESC
VI_TOTAL_TARGET
FUND_TYPE_DESC_SHORT
VI_DISTINCT_FUND_ZIP

LoadFunds.sql

DROP TABLE fundbase;
CREATE TABLE fundbase (
  fundname varchar(255),
  closedate date, --mm-dd-yyyy
  lastinvdate date, --mm-dd-yyyy
  firstinvdate date, --mm-dd-yyyy
  numportcos integer,
  investedk real,
  city varchar(100),
  fundyear varchar(4), --yyyy
  zip varchar(10),
  statecode varchar(2),
  fundsizem real,
  fundstage varchar(100),
  firmname varchar(255),
  lastupdated varchar(255),
  fundnationcode varchar(10),
  invtype varchar(100),
  msacode varchar(100),
  nationcode varchar(10),
  raisestatus varchar(100),
  seqnum integer,
  targetsizem real,
  seqtype varchar(100)
);

\COPY fundbase FROM 'VCFund1980-present-NoFoot-normal.txt' WITH DELIMITER AS E'\t' HEADER NULL AS  CSV
--27588

VC Firms

Core Files:

USVCFirms1980-Present.rpt                                                   
USVCFirms1980-Present.ssh                                                   
USVCFirms1980-Present.txt                                                   
USVCFirms1980-Present-normal.txt                                            

Created: 9/29/2017 3:46:06 PM

Session Contents:

DATABASE: Private Equity Firms (VIFM)
Venture Related Deals: Select All Venture Related Deals
Firm Portfolio Cos: Round Date: 01/01/1980 to 09/29/2017 (Custom) (Calendar)
Firm Portfolio Cos: Nation : US

Custom Report Contents:

FIRM_NAME
VI_YEARFOUND
FIRM_FIRST_INV_DATE
FIRM_LAST_INV_DATE
VI_STREET1
VI_STREET2
VI_STREET3
VI_CITY
VI_ZIP
FIRM_AREA_CODE
FIRM_COUNTY
VI_STATE
VI_NATION
VI_NATION_LONG
FIRM_WREG1_DESC
FIRM_NUM_COMPANIES
FIRM_NUM_ROUNDS
FIRM_TOTIN
FIRM_CAP_MGT
FIRM_STATUS_DESC
FIRM_MSA_CODE
FIRM_ROLE_DESC
FIRM_GEOGRAPHY_PREF_DESC
FIRM_INDUSTRY_PREF_DESC
FIRM_STAGE_PREF_DESC
FIRM_TYPE_DESC

LoadFirms2.sql

DROP TABLE firmbase;
CREATE TABLE firmbase(
 firmname varchar(255),
 foundingdate date, --mm-dd-yyyy
 datefirstinv date, --mm-dd-yyyy  
 datelastinv date, --mm-dd-yyyy
 addr1 varchar(100),
 addr2 varchar(100),
 location varchar(100),
 city varchar(100),
 zip varchar(10),
 areacode integer,
 county varchar(100),
 state varchar(2),
 nationcode varchar(10),
 nation varchar(100),
 worldregion varchar(100),
 numportcos integer,
 numrounds integer,
 investedk money,
 capitalundermgmt money,  
 invstatus varchar(100),
 msacode varchar(10),
 rolepref varchar(100),
 geogpref varchar(100),
 indpref varchar(100),
 stagepref varchar(100),
 type varchar(100)
 
);

\COPY firmbase FROM 'USVCFirms1980-Present-normal.txt' WITH DELIMITER AS E'\t' HEADER NULL AS  CSV
--9734

Branch Offices

Core Files

USVCFirmBranchOffices1980-Present.rpt                                                   
USVCFirmBranchOffices1980-Present.ssh                                                   
USVCFirmBranchOffices1980-Present.txt                                                   
USVCFirmBranchOffices1980-Present-normal.txt   

Created: 9/29/2017 3:54:32 PM

Session Contents:

DATABASE: Private Equity Firms (VIFM)
Venture Related Deals: Select All Venture Related Deals
Firm Portfolio Cos: Round Date: 01/01/1980 to 09/29/2017 (Custom) (Calendar)
Firm Portfolio Cos: Nation : US

Custom Report Contents:

FIRM_NAME
VI_CAPITAL_CITY
VI_CAPITAL_STATE
VI_CAPITAL_COUNTRY
VI_CAPITAL_NATION
VI_POSTAL_CODE
VI_ADR_LINE1
VI_ADR_LINE2

LoadBranchOffices.sql

DROP TABLE branchoffices;
CREATE TABLE branchoffices (
  firmname varchar(255),
  bocity varchar(100),
  bostate varchar(2),
  bocountrycode varchar(2),
  bonation varchar(100),
  bozip varchar(10),
  boaddr1 varchar(100),
  boaddr2 varchar(100)
 
);
\COPY branchoffices FROM 'USVCFirmBranchOffices1980-Present-normal.txt' WITH DELIMITER AS E'\t' HEADER NULL AS  CSV
--9734

Company Long Description

DROP TABLE longdescription; CREATE TABLE longdescription (

 coname varchar(255),
 statecode varchar(10),
 fundingdate date, -- Date Company Received First Investment
 codescription varchar(10000) --long description

);

\COPY longdescription FROM 'ResultOut.txt' WITH DELIMITER AS E'\t' HEADER NULL AS CSV --44793