Patentdata:Column | Type | Modifiers---------+-------------------+----------- patent | integer | kind | character varying | claims | integer | apptype | integer | appnum | integer | gdate | date | gyear | integer | appdate | date | appyear | integer | Column Names: patent int, kind varchar, claims int, apptype int, appnum int, gdate date, gyear int, appdate date, appyear int Patent_2015: Column | Type | Modifiers -------------------------------+---------+----------- patentnumber | int | not null kind | varchar | grantdate | date | type | varchar | applicationnumber | varchar | filingdate | date | prioritydate | date | prioritycountry | varchar | prioritypatentnumber | varchar | ussubclass | varchar | maingroup | varchar | subgroup | varchar | cpcsubclass | varchar | cpcmaingroup | varchar | cpcsubgroup | varchar | classificationnationalcountry | varchar | classificationnationalclass | varchar | title | varchar | numberofclaims | int | primaryexaminerfirstname | varchar | primaryexaminerlastname | varchar | primaryexaminerdepartment | varchar | pctpatentnumber | varchar | filename | varchar | Column Names: patentnumber int, kind varchar, grantdate date, type varchar, applicationnumber varchar, filingdate date, prioritydate date, prioritycountry varchar, prioritypatentnumber varchar, ussubclass varchar, maingroup varchar, subgroup varchar, cpcsubclass varchar, cpcmaingroup varchar, cpcsubgroup varchar, classificationnationalcountry varchar, classificationnationalclass varchar, title varchar, numberofclaims int, primaryexaminerfirstname varchar, primaryexaminerlastname varchar, primaryexaminerdepartment varchar, pctpatentnumber varchar, filename varchar Combined Schema: Column Names: patent int, kind varchar, claims int, apptype int, appnum int, gdate date, gyear int, appdate date, appyear int patentnumber int, -- patent kind varchar, -- kind grantdate date, --gdate type varchar, applicationnumber varchar, filingdate date, prioritydate date, prioritycountry varchar, prioritypatentnumber varchar, ussubclass varchar, maingroup varchar, subgroup varchar, cpcsubclass varchar, cpcmaingroup varchar, cpcsubgroup varchar, classificationnationalcountry varchar, classificationnationalclass varchar, title varchar, numberofclaims int, primaryexaminerfirstname varchar, primaryexaminerlastname varchar, primaryexaminerdepartment varchar, pctpatentnumber varchar, filename varchar claims int, apptype int, appnum int, gyear int, appdate date, appyear int Output Schema: patents CREATE TABLE patents_merged( patentnumber int, kind varchar, grantdate date, type varchar, applicationnumber varchar, filingdate date, prioritydate date, prioritycountry varchar, prioritypatentnumber varchar, ussubclass varchar, maingroup varchar, subgroup varchar, cpcsubclass varchar, cpcmaingroup varchar, cpcsubgroup varchar, classificationnationalcountry varchar, classificationnationalclass varchar, title varchar, numberofclaims int, primaryexaminerfirstname varchar, primaryexaminerlastname varchar, primaryexaminerdepartment varchar, pctpatentnumber varchar, filename varchar, claims int, apptype int, appnum int, gyear int, appdate date, appyear int ); patentdata: INSERT INTO patents_merged ( SELECT patent, kind, gdate, 'NULL', 'NULL', NULL, NULL, 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', -1, 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', claims, apptype, appnum, gyear, appdate, appyear FROM patents ); -- RESULT : INSERT 0 3984771 patent_2015: INSERT INTO patents_merged ( SELECT patentnumber, kind, grantdate, type, applicationnumber, filingdate, prioritydate, prioritycountry, prioritypatentnumber, ussubclass, maingroup, subgroup, cpcsubclass, cpcmaingroup, cpcsubgroup, classificationnationalcountry, classificationnationalclass, title, numberofclaims, primaryexaminerfirstname, primaryexaminerlastname, primaryexaminerdepartment, pctpatentnumber, filename, -1, -1, -1, -1, NULL, -1 FROM patents ); -- RESULT : INSERT 0 1646225 COPY SCRIPTS:patentdata:\COPY patents_merged TO '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;--COPY 3984771 patent_2015:\COPY patents_merged TO '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;--COPY 1646225 PATENTS TABLE\COPY patents FROM '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;-- RESULT : COPY 3984771\COPY patents FROM '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;-- RESULT : COPY 1646225 TESTING:select count(*) FROM (SELECT DISTINCT patentnumber FROM patents) AS t;--RESULT: 5411151EXPECTED: 5426566 SELECT COUNT(*), *FROM patentsGROUP BY patentnumber, kind, grantdate, type, applicationnumber, filingdate, prioritydate, prioritycountry, prioritypatentnumber, ussubclass, maingroup, subgroup, cpcsubclass, cpcmaingroup, cpcsubgroup, classificationnationalcountry, classificationnationalclass, title, numberofclaims, primaryexaminerfirstname, primaryexaminerlastname, primaryexaminerdepartment, pctpatentnumber, filename, claims, apptype, appnum, gyear, appdate, appyearHAVING COUNT(*) > 1; SELECT patentnumber, count(*)FROM patentsGROUP BY patentnumberHAVING count(*)>1;--7640598 SELECT * FROM patents opWHERE op.patentnumber IN ( SELECT ip.patentnumber FROM patents ip GROUP BY ip.patentnumber HAVING COUNT(*)>1 )ORDER BY op.patentnumber; (SELECT * INTO patentsCleanedFROM patents opWHERE op.patentnumber IN ( SELECT ip.patentnumber FROM patents ip GROUP BY ip.patentnumber HAVING COUNT(*)=1 )ORDER BY op.patentnumber)--SELECT 5191306 INSERT INTO patentsCleaned(SELECT * FROM patents opWHERE op.patentnumber IN ( SELECT ip.patentnumber FROM patents ip GROUP BY ip.patentnumber HAVING COUNT(*)>1 )AND op.applicationnumber NOT LIKE 'NULL'ORDER BY op.patentnumber); --219845 TESTING:allpatent=# select count(*) from patentsCleaned; count--------- 5411151(1 row) allpatent=# select count(*), patentnumber FROM patentsCleaned group by patentnumber having count(*) > 1; count | patentnumber-------+--------------(0 rows) ********** INDEX CREATION ************** ALTER TABLE patents ADD PRIMARY KEY (patentnumber);-- RESULT : ALTER TABLEallpatent=# CREATE UNIQUE INDEX patent_idx ON patents (patentnumber); allpatent=# CREATE INDEX ON assignees (orgname);CREATE INDEX **Patents Patentdata:Column | Type | Modifiers---------+-------------------+----------- patent | integer | kind | character varying | claims | integer | apptype | integer | appnum | integer | gdate | date | gyear | integer | appdate | date | appyear | integer | Column Names: patent int, kind varchar, claims int, apptype int, appnum int, gdate date, gyear int, appdate date, appyear int Patent_2015: Column | Type | Modifiers -------------------------------+---------+----------- patentnumber | int | not null kind | varchar | grantdate | date | type | varchar | applicationnumber | varchar | filingdate | date | prioritydate | date | prioritycountry | varchar | prioritypatentnumber | varchar | ussubclass | varchar | maingroup | varchar | subgroup | varchar | cpcsubclass | varchar | cpcmaingroup | varchar | cpcsubgroup | varchar | classificationnationalcountry | varchar | classificationnationalclass | varchar | title | varchar | numberofclaims | int | primaryexaminerfirstname | varchar | primaryexaminerlastname | varchar | primaryexaminerdepartment | varchar | pctpatentnumber | varchar | filename | varchar | Column Names: patentnumber int, kind varchar, grantdate date, type varchar, applicationnumber varchar, filingdate date, prioritydate date, prioritycountry varchar, prioritypatentnumber varchar, ussubclass varchar, maingroup varchar, subgroup varchar, cpcsubclass varchar, cpcmaingroup varchar, cpcsubgroup varchar, classificationnationalcountry varchar, classificationnationalclass varchar, title varchar, numberofclaims int, primaryexaminerfirstname varchar, primaryexaminerlastname varchar, primaryexaminerdepartment varchar, pctpatentnumber varchar, filename varchar Combined Schema: Column Names: patent int, kind varchar, claims int, apptype int, appnum int, gdate date, gyear int, appdate date, appyear int patentnumber int, -- patent kind varchar, -- kind grantdate date, --gdate type varchar, applicationnumber varchar, filingdate date, prioritydate date, prioritycountry varchar, prioritypatentnumber varchar, ussubclass varchar, maingroup varchar, subgroup varchar, cpcsubclass varchar, cpcmaingroup varchar, cpcsubgroup varchar, classificationnationalcountry varchar, classificationnationalclass varchar, title varchar, numberofclaims int, primaryexaminerfirstname varchar, primaryexaminerlastname varchar, primaryexaminerdepartment varchar, pctpatentnumber varchar, filename varchar claims int, apptype int, appnum int, gyear int, appdate date, appyear int Output Schema: patents CREATE TABLE patents_merged( patentnumber int, kind varchar, grantdate date, type varchar, applicationnumber varchar, filingdate date, prioritydate date, prioritycountry varchar, prioritypatentnumber varchar, ussubclass varchar, maingroup varchar, subgroup varchar, cpcsubclass varchar, cpcmaingroup varchar, cpcsubgroup varchar, classificationnationalcountry varchar, classificationnationalclass varchar, title varchar, numberofclaims int, primaryexaminerfirstname varchar, primaryexaminerlastname varchar, primaryexaminerdepartment varchar, pctpatentnumber varchar, filename varchar, claims int, apptype int, appnum int, gyear int, appdate date, appyear int ); patentdata: INSERT INTO patents_merged ( SELECT patent, kind, gdate, 'NULL', 'NULL', NULL, NULL, 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', -1, 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', claims, apptype, appnum, gyear, appdate, appyear FROM patents ); -- RESULT : INSERT 0 3984771 patent_2015: INSERT INTO patents_merged ( SELECT patentnumber, kind, grantdate, type, applicationnumber, filingdate, prioritydate, prioritycountry, prioritypatentnumber, ussubclass, maingroup, subgroup, cpcsubclass, cpcmaingroup, cpcsubgroup, classificationnationalcountry, classificationnationalclass, title, numberofclaims, primaryexaminerfirstname, primaryexaminerlastname, primaryexaminerdepartment, pctpatentnumber, filename, -1, -1, -1, -1, NULL, -1 FROM patents ); -- RESULT : INSERT 0 1646225 COPY SCRIPTS:patentdata:\COPY patents_merged TO '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;--COPY 3984771 patent_2015:\COPY patents_merged TO '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;--COPY 1646225 PATENTS TABLE\COPY patents FROM '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;-- RESULT : COPY 3984771\COPY patents FROM '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV;-- RESULT : COPY 1646225 TESTING:select count(*) FROM (SELECT DISTINCT patentnumber FROM patents) AS t;--RESULT: 5411151EXPECTED: 5426566 SELECT COUNT(*), *FROM patentsGROUP BY patentnumber, kind, grantdate, type, applicationnumber, filingdate, prioritydate, prioritycountry, prioritypatentnumber, ussubclass, maingroup, subgroup, cpcsubclass, cpcmaingroup, cpcsubgroup, classificationnationalcountry, classificationnationalclass, title, numberofclaims, primaryexaminerfirstname, primaryexaminerlastname, primaryexaminerdepartment, pctpatentnumber, filename, claims, apptype, appnum, gyear, appdate, appyearHAVING COUNT(*) > 1; SELECT patentnumber, count(*)FROM patentsGROUP BY patentnumberHAVING count(*)>1;--7640598 SELECT * FROM patents opWHERE op.patentnumber IN ( SELECT ip.patentnumber FROM patents ip GROUP BY ip.patentnumber HAVING COUNT(*)>1 )ORDER BY op.patentnumber; (SELECT * INTO patentsCleanedFROM patents opWHERE op.patentnumber IN ( SELECT ip.patentnumber FROM patents ip GROUP BY ip.patentnumber HAVING COUNT(*)=1 )ORDER BY op.patentnumber)--SELECT 5191306 INSERT INTO patentsCleaned(SELECT * FROM patents opWHERE op.patentnumber IN ( SELECT ip.patentnumber FROM patents ip GROUP BY ip.patentnumber HAVING COUNT(*)>1 )AND op.applicationnumber NOT LIKE 'NULL'ORDER BY op.patentnumber); --219845 TESTING:allpatent=# select count(*) from patentsCleaned; count--------- 5411151(1 row) allpatent=# select count(*), patentnumber FROM patentsCleaned group by patentnumber having count(*) > 1; count | patentnumber-------+--------------(0 rows) ********** INDEX CREATION ************** ALTER TABLE patents ADD PRIMARY KEY (patentnumber);-- RESULT : ALTER TABLEallpatent=# CREATE UNIQUE INDEX patent_idx ON patents (patentnumber); allpatent=# CREATE INDEX ON assignees (orgname);CREATE INDEX **Citations
To Extract Patents with Numbers Only and to Ignore Other RegExes