SELECT COUNT(*) FROM (SELECT DISTINCT fundname, firstinvdate FROM fundbase1 WHERE undisclosedflag = 0)a;
--27097
You can see that fundname, firstinvdate is a good key. But we're going to use simply the fundname as a key because it will be easier to do join operations later.
CREATE TABLE fundbasecore AS
SELECT *
FROM fundbase1 WHERE undisclosedflag = 0;
--27097
SELECT COUNT(*) FROM (SELECT DISTINCT fundname FROM fundbase1 WHERE undisclosedflag = 0)a;
--27050
The plan is to grab all the duplicate fundnames and only include the ones with the MIN(closedate) AND MIN(lastinvdate) in the fundbasecore table.
DROP TABLE fundnameexclude;
CREATE TABLE fundnameexclude AS
SELECT fundname, COUNT(*) FROM (SELECT fundname FROM fundbase1 WHERE undisclosedflag = 0)a
GROUP BY fundname
HAVING COUNT(*) > 1;
--47
DROP TABLE fundexclude;
CREATE TABLE fundexclude AS
SELECT f.*
FROM fundbase1 AS f
INNER JOIN fundnameexclude as e ON f.fundname = e.fundname;
--94
DROP TABLE fundbase2;
CREATE TABLE fundbase2 AS
SELECT *
FROM fundbase1 WHERE undisclosedflag = 0
EXCEPT
SELECT *
FROM fundexclude;
--27003
DROP TABLE fundinclude;
CREATE TABLE fundinclude AS
SELECT fundname, MIN(closedate) AS closedate, MIN(lastinvdate) AS lastinvdate
FROM fundexclude
GROUP BY fundname;
--47
DROP TABLE fundinclude2;
CREATE TABLE fundinclude2 AS
SELECT f.*
FROM fundbase1 AS f
INNER JOIN fundinclude AS fu ON f.fundname = fu.fundname AND f.closedate = fu.closedate AND f.lastinvdate = fu.lastinvdate;
--44
--create fundcore table
DROP TABLE fundbasecore;
CREATE TABLE fundbasecore AS
SELECT * FROM fundbase2
UNION ALL
SELECT * FROM fundinclude2;
--27047
==Name based matching firms to funds==
Get the firms and fund keys and also include the firmname from the fundbasecore table. Run these two files through the Matcher. Then manually flag the multiple matches. There are only ~50 of them. Then reimport to vcdb2.