Difference between revisions of "Patent Data Processing - SQL Steps"
(Created page with "Citations To Extract Patents with Numbers Only and to Ignore Other RegExes CREATE OR REPLACE FUNCTION cleanpatno (text) RETURNS bigint AS $$ if ($_[0]) { my $var=$_[0];...") |
|||
Line 1: | Line 1: | ||
− | Citations | + | |
+ | Patentdata: | ||
+ | Column | Type | Modifiers | ||
+ | ---------+-------------------+----------- | ||
+ | patent | integer | | ||
+ | kind | character varying | | ||
+ | claims | integer | | ||
+ | apptype | integer | | ||
+ | appnum | integer | | ||
+ | gdate | date | | ||
+ | gyear | integer | | ||
+ | appdate | date | | ||
+ | appyear | integer | | ||
+ | |||
+ | Column Names: | ||
+ | patent int, | ||
+ | kind varchar, | ||
+ | claims int, | ||
+ | apptype int, | ||
+ | appnum int, | ||
+ | gdate date, | ||
+ | gyear int, | ||
+ | appdate date, | ||
+ | appyear int | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | Patent_2015: | ||
+ | Column | Type | Modifiers | ||
+ | -------------------------------+---------+----------- | ||
+ | patentnumber | int | not null | ||
+ | kind | varchar | | ||
+ | grantdate | date | | ||
+ | type | varchar | | ||
+ | applicationnumber | varchar | | ||
+ | filingdate | date | | ||
+ | prioritydate | date | | ||
+ | prioritycountry | varchar | | ||
+ | prioritypatentnumber | varchar | | ||
+ | ussubclass | varchar | | ||
+ | maingroup | varchar | | ||
+ | subgroup | varchar | | ||
+ | cpcsubclass | varchar | | ||
+ | cpcmaingroup | varchar | | ||
+ | cpcsubgroup | varchar | | ||
+ | classificationnationalcountry | varchar | | ||
+ | classificationnationalclass | varchar | | ||
+ | title | varchar | | ||
+ | numberofclaims | int | | ||
+ | primaryexaminerfirstname | varchar | | ||
+ | primaryexaminerlastname | varchar | | ||
+ | primaryexaminerdepartment | varchar | | ||
+ | pctpatentnumber | varchar | | ||
+ | filename | varchar | | ||
+ | |||
+ | Column Names: | ||
+ | patentnumber int, | ||
+ | kind varchar, | ||
+ | grantdate date, | ||
+ | type varchar, | ||
+ | applicationnumber varchar, | ||
+ | filingdate date, | ||
+ | prioritydate date, | ||
+ | prioritycountry varchar, | ||
+ | prioritypatentnumber varchar, | ||
+ | ussubclass varchar, | ||
+ | maingroup varchar, | ||
+ | subgroup varchar, | ||
+ | cpcsubclass varchar, | ||
+ | cpcmaingroup varchar, | ||
+ | cpcsubgroup varchar, | ||
+ | classificationnationalcountry varchar, | ||
+ | classificationnationalclass varchar, | ||
+ | title varchar, | ||
+ | numberofclaims int, | ||
+ | primaryexaminerfirstname varchar, | ||
+ | primaryexaminerlastname varchar, | ||
+ | primaryexaminerdepartment varchar, | ||
+ | pctpatentnumber varchar, | ||
+ | filename varchar | ||
+ | |||
+ | Combined Schema: | ||
+ | |||
+ | Column Names: | ||
+ | patent int, | ||
+ | kind varchar, | ||
+ | claims int, | ||
+ | apptype int, | ||
+ | appnum int, | ||
+ | gdate date, | ||
+ | gyear int, | ||
+ | appdate date, | ||
+ | appyear int | ||
+ | |||
+ | |||
+ | patentnumber int, -- patent | ||
+ | kind varchar, -- kind | ||
+ | grantdate date, --gdate | ||
+ | type varchar, | ||
+ | applicationnumber varchar, | ||
+ | filingdate date, | ||
+ | prioritydate date, | ||
+ | prioritycountry varchar, | ||
+ | prioritypatentnumber varchar, | ||
+ | ussubclass varchar, | ||
+ | maingroup varchar, | ||
+ | subgroup varchar, | ||
+ | cpcsubclass varchar, | ||
+ | cpcmaingroup varchar, | ||
+ | cpcsubgroup varchar, | ||
+ | classificationnationalcountry varchar, | ||
+ | classificationnationalclass varchar, | ||
+ | title varchar, | ||
+ | numberofclaims int, | ||
+ | primaryexaminerfirstname varchar, | ||
+ | primaryexaminerlastname varchar, | ||
+ | primaryexaminerdepartment varchar, | ||
+ | pctpatentnumber varchar, | ||
+ | filename varchar | ||
+ | claims int, | ||
+ | apptype int, | ||
+ | appnum int, | ||
+ | gyear int, | ||
+ | appdate date, | ||
+ | appyear int | ||
+ | |||
+ | |||
+ | Output Schema: patents | ||
+ | |||
+ | CREATE TABLE patents_merged( | ||
+ | patentnumber int, | ||
+ | kind varchar, | ||
+ | grantdate date, | ||
+ | type varchar, | ||
+ | applicationnumber varchar, | ||
+ | filingdate date, | ||
+ | prioritydate date, | ||
+ | prioritycountry varchar, | ||
+ | prioritypatentnumber varchar, | ||
+ | ussubclass varchar, | ||
+ | maingroup varchar, | ||
+ | subgroup varchar, | ||
+ | cpcsubclass varchar, | ||
+ | cpcmaingroup varchar, | ||
+ | cpcsubgroup varchar, | ||
+ | classificationnationalcountry varchar, | ||
+ | classificationnationalclass varchar, | ||
+ | title varchar, | ||
+ | numberofclaims int, | ||
+ | primaryexaminerfirstname varchar, | ||
+ | primaryexaminerlastname varchar, | ||
+ | primaryexaminerdepartment varchar, | ||
+ | pctpatentnumber varchar, | ||
+ | filename varchar, | ||
+ | claims int, | ||
+ | apptype int, | ||
+ | appnum int, | ||
+ | gyear int, | ||
+ | appdate date, | ||
+ | appyear int | ||
+ | ); | ||
+ | |||
+ | |||
+ | patentdata: | ||
+ | INSERT INTO patents_merged | ||
+ | ( | ||
+ | SELECT | ||
+ | patent, | ||
+ | kind, | ||
+ | gdate, | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | NULL, | ||
+ | NULL, | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | -1, | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | claims, | ||
+ | apptype, | ||
+ | appnum, | ||
+ | gyear, | ||
+ | appdate, | ||
+ | appyear | ||
+ | FROM patents | ||
+ | ); | ||
+ | -- RESULT : INSERT 0 3984771 | ||
+ | |||
+ | patent_2015: | ||
+ | INSERT INTO patents_merged | ||
+ | ( | ||
+ | SELECT | ||
+ | patentnumber, | ||
+ | kind, | ||
+ | grantdate, | ||
+ | type, | ||
+ | applicationnumber, | ||
+ | filingdate, | ||
+ | prioritydate, | ||
+ | prioritycountry, | ||
+ | prioritypatentnumber, | ||
+ | ussubclass, | ||
+ | maingroup, | ||
+ | subgroup, | ||
+ | cpcsubclass, | ||
+ | cpcmaingroup, | ||
+ | cpcsubgroup, | ||
+ | classificationnationalcountry, | ||
+ | classificationnationalclass, | ||
+ | title, | ||
+ | numberofclaims, | ||
+ | primaryexaminerfirstname, | ||
+ | primaryexaminerlastname, | ||
+ | primaryexaminerdepartment, | ||
+ | pctpatentnumber, | ||
+ | filename, | ||
+ | -1, | ||
+ | -1, | ||
+ | -1, | ||
+ | -1, | ||
+ | NULL, | ||
+ | -1 | ||
+ | FROM patents | ||
+ | ); | ||
+ | -- RESULT : INSERT 0 1646225 | ||
+ | |||
+ | COPY SCRIPTS: | ||
+ | patentdata: | ||
+ | \COPY patents_merged TO '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV; | ||
+ | --COPY 3984771 | ||
+ | |||
+ | patent_2015: | ||
+ | \COPY patents_merged TO '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV; | ||
+ | --COPY 1646225 | ||
+ | |||
+ | PATENTS TABLE | ||
+ | \COPY patents FROM '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV; | ||
+ | -- RESULT : COPY 3984771 | ||
+ | \COPY patents FROM '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV; | ||
+ | -- RESULT : COPY 1646225 | ||
+ | |||
+ | |||
+ | TESTING: | ||
+ | select count(*) FROM (SELECT DISTINCT patentnumber FROM patents) AS t; | ||
+ | --RESULT: 5411151 | ||
+ | EXPECTED: 5426566 | ||
+ | |||
+ | SELECT COUNT(*), * | ||
+ | FROM patents | ||
+ | GROUP BY | ||
+ | patentnumber, | ||
+ | kind, | ||
+ | grantdate, | ||
+ | type, | ||
+ | applicationnumber, | ||
+ | filingdate, | ||
+ | prioritydate, | ||
+ | prioritycountry, | ||
+ | prioritypatentnumber, | ||
+ | ussubclass, | ||
+ | maingroup, | ||
+ | subgroup, | ||
+ | cpcsubclass, | ||
+ | cpcmaingroup, | ||
+ | cpcsubgroup, | ||
+ | classificationnationalcountry, | ||
+ | classificationnationalclass, | ||
+ | title, | ||
+ | numberofclaims, | ||
+ | primaryexaminerfirstname, | ||
+ | primaryexaminerlastname, | ||
+ | primaryexaminerdepartment, | ||
+ | pctpatentnumber, | ||
+ | filename, | ||
+ | claims, | ||
+ | apptype, | ||
+ | appnum, | ||
+ | gyear, | ||
+ | appdate, | ||
+ | appyear | ||
+ | HAVING COUNT(*) > 1; | ||
+ | |||
+ | SELECT patentnumber, count(*) | ||
+ | FROM patents | ||
+ | GROUP BY patentnumber | ||
+ | HAVING count(*)>1; | ||
+ | --7640598 | ||
+ | |||
+ | |||
+ | SELECT * | ||
+ | FROM patents op | ||
+ | WHERE op.patentnumber IN | ||
+ | ( | ||
+ | SELECT ip.patentnumber | ||
+ | FROM patents ip | ||
+ | GROUP BY ip.patentnumber | ||
+ | HAVING COUNT(*)>1 | ||
+ | ) | ||
+ | ORDER BY op.patentnumber; | ||
+ | |||
+ | ( | ||
+ | SELECT * | ||
+ | INTO patentsCleaned | ||
+ | FROM patents op | ||
+ | WHERE op.patentnumber IN | ||
+ | ( | ||
+ | SELECT ip.patentnumber | ||
+ | FROM patents ip | ||
+ | GROUP BY ip.patentnumber | ||
+ | HAVING COUNT(*)=1 | ||
+ | ) | ||
+ | ORDER BY op.patentnumber | ||
+ | ) | ||
+ | --SELECT 5191306 | ||
+ | |||
+ | |||
+ | INSERT INTO patentsCleaned( | ||
+ | SELECT * | ||
+ | FROM patents op | ||
+ | WHERE op.patentnumber IN | ||
+ | ( | ||
+ | SELECT ip.patentnumber | ||
+ | FROM patents ip | ||
+ | GROUP BY ip.patentnumber | ||
+ | HAVING COUNT(*)>1 | ||
+ | ) | ||
+ | AND op.applicationnumber NOT LIKE 'NULL' | ||
+ | ORDER BY op.patentnumber | ||
+ | ); | ||
+ | |||
+ | --219845 | ||
+ | |||
+ | TESTING: | ||
+ | allpatent=# select count(*) from patentsCleaned; | ||
+ | count | ||
+ | --------- | ||
+ | 5411151 | ||
+ | (1 row) | ||
+ | |||
+ | allpatent=# select count(*), patentnumber FROM patentsCleaned group by patentnumber having count(*) > 1; | ||
+ | count | patentnumber | ||
+ | -------+-------------- | ||
+ | (0 rows) | ||
+ | |||
+ | |||
+ | ********** INDEX CREATION ************** | ||
+ | |||
+ | ALTER TABLE patents ADD PRIMARY KEY (patentnumber); | ||
+ | -- RESULT : ALTER TABLE | ||
+ | allpatent=# CREATE UNIQUE INDEX patent_idx ON patents (patentnumber); | ||
+ | |||
+ | allpatent=# CREATE INDEX ON assignees (orgname); | ||
+ | CREATE INDEX | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | **Patents | ||
+ | |||
+ | Patentdata: | ||
+ | Column | Type | Modifiers | ||
+ | ---------+-------------------+----------- | ||
+ | patent | integer | | ||
+ | kind | character varying | | ||
+ | claims | integer | | ||
+ | apptype | integer | | ||
+ | appnum | integer | | ||
+ | gdate | date | | ||
+ | gyear | integer | | ||
+ | appdate | date | | ||
+ | appyear | integer | | ||
+ | |||
+ | Column Names: | ||
+ | patent int, | ||
+ | kind varchar, | ||
+ | claims int, | ||
+ | apptype int, | ||
+ | appnum int, | ||
+ | gdate date, | ||
+ | gyear int, | ||
+ | appdate date, | ||
+ | appyear int | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | Patent_2015: | ||
+ | Column | Type | Modifiers | ||
+ | -------------------------------+---------+----------- | ||
+ | patentnumber | int | not null | ||
+ | kind | varchar | | ||
+ | grantdate | date | | ||
+ | type | varchar | | ||
+ | applicationnumber | varchar | | ||
+ | filingdate | date | | ||
+ | prioritydate | date | | ||
+ | prioritycountry | varchar | | ||
+ | prioritypatentnumber | varchar | | ||
+ | ussubclass | varchar | | ||
+ | maingroup | varchar | | ||
+ | subgroup | varchar | | ||
+ | cpcsubclass | varchar | | ||
+ | cpcmaingroup | varchar | | ||
+ | cpcsubgroup | varchar | | ||
+ | classificationnationalcountry | varchar | | ||
+ | classificationnationalclass | varchar | | ||
+ | title | varchar | | ||
+ | numberofclaims | int | | ||
+ | primaryexaminerfirstname | varchar | | ||
+ | primaryexaminerlastname | varchar | | ||
+ | primaryexaminerdepartment | varchar | | ||
+ | pctpatentnumber | varchar | | ||
+ | filename | varchar | | ||
+ | |||
+ | Column Names: | ||
+ | patentnumber int, | ||
+ | kind varchar, | ||
+ | grantdate date, | ||
+ | type varchar, | ||
+ | applicationnumber varchar, | ||
+ | filingdate date, | ||
+ | prioritydate date, | ||
+ | prioritycountry varchar, | ||
+ | prioritypatentnumber varchar, | ||
+ | ussubclass varchar, | ||
+ | maingroup varchar, | ||
+ | subgroup varchar, | ||
+ | cpcsubclass varchar, | ||
+ | cpcmaingroup varchar, | ||
+ | cpcsubgroup varchar, | ||
+ | classificationnationalcountry varchar, | ||
+ | classificationnationalclass varchar, | ||
+ | title varchar, | ||
+ | numberofclaims int, | ||
+ | primaryexaminerfirstname varchar, | ||
+ | primaryexaminerlastname varchar, | ||
+ | primaryexaminerdepartment varchar, | ||
+ | pctpatentnumber varchar, | ||
+ | filename varchar | ||
+ | |||
+ | Combined Schema: | ||
+ | |||
+ | Column Names: | ||
+ | patent int, | ||
+ | kind varchar, | ||
+ | claims int, | ||
+ | apptype int, | ||
+ | appnum int, | ||
+ | gdate date, | ||
+ | gyear int, | ||
+ | appdate date, | ||
+ | appyear int | ||
+ | |||
+ | |||
+ | patentnumber int, -- patent | ||
+ | kind varchar, -- kind | ||
+ | grantdate date, --gdate | ||
+ | type varchar, | ||
+ | applicationnumber varchar, | ||
+ | filingdate date, | ||
+ | prioritydate date, | ||
+ | prioritycountry varchar, | ||
+ | prioritypatentnumber varchar, | ||
+ | ussubclass varchar, | ||
+ | maingroup varchar, | ||
+ | subgroup varchar, | ||
+ | cpcsubclass varchar, | ||
+ | cpcmaingroup varchar, | ||
+ | cpcsubgroup varchar, | ||
+ | classificationnationalcountry varchar, | ||
+ | classificationnationalclass varchar, | ||
+ | title varchar, | ||
+ | numberofclaims int, | ||
+ | primaryexaminerfirstname varchar, | ||
+ | primaryexaminerlastname varchar, | ||
+ | primaryexaminerdepartment varchar, | ||
+ | pctpatentnumber varchar, | ||
+ | filename varchar | ||
+ | claims int, | ||
+ | apptype int, | ||
+ | appnum int, | ||
+ | gyear int, | ||
+ | appdate date, | ||
+ | appyear int | ||
+ | |||
+ | |||
+ | Output Schema: patents | ||
+ | |||
+ | CREATE TABLE patents_merged( | ||
+ | patentnumber int, | ||
+ | kind varchar, | ||
+ | grantdate date, | ||
+ | type varchar, | ||
+ | applicationnumber varchar, | ||
+ | filingdate date, | ||
+ | prioritydate date, | ||
+ | prioritycountry varchar, | ||
+ | prioritypatentnumber varchar, | ||
+ | ussubclass varchar, | ||
+ | maingroup varchar, | ||
+ | subgroup varchar, | ||
+ | cpcsubclass varchar, | ||
+ | cpcmaingroup varchar, | ||
+ | cpcsubgroup varchar, | ||
+ | classificationnationalcountry varchar, | ||
+ | classificationnationalclass varchar, | ||
+ | title varchar, | ||
+ | numberofclaims int, | ||
+ | primaryexaminerfirstname varchar, | ||
+ | primaryexaminerlastname varchar, | ||
+ | primaryexaminerdepartment varchar, | ||
+ | pctpatentnumber varchar, | ||
+ | filename varchar, | ||
+ | claims int, | ||
+ | apptype int, | ||
+ | appnum int, | ||
+ | gyear int, | ||
+ | appdate date, | ||
+ | appyear int | ||
+ | ); | ||
+ | |||
+ | |||
+ | patentdata: | ||
+ | INSERT INTO patents_merged | ||
+ | ( | ||
+ | SELECT | ||
+ | patent, | ||
+ | kind, | ||
+ | gdate, | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | NULL, | ||
+ | NULL, | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | -1, | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | 'NULL', | ||
+ | claims, | ||
+ | apptype, | ||
+ | appnum, | ||
+ | gyear, | ||
+ | appdate, | ||
+ | appyear | ||
+ | FROM patents | ||
+ | ); | ||
+ | -- RESULT : INSERT 0 3984771 | ||
+ | |||
+ | patent_2015: | ||
+ | INSERT INTO patents_merged | ||
+ | ( | ||
+ | SELECT | ||
+ | patentnumber, | ||
+ | kind, | ||
+ | grantdate, | ||
+ | type, | ||
+ | applicationnumber, | ||
+ | filingdate, | ||
+ | prioritydate, | ||
+ | prioritycountry, | ||
+ | prioritypatentnumber, | ||
+ | ussubclass, | ||
+ | maingroup, | ||
+ | subgroup, | ||
+ | cpcsubclass, | ||
+ | cpcmaingroup, | ||
+ | cpcsubgroup, | ||
+ | classificationnationalcountry, | ||
+ | classificationnationalclass, | ||
+ | title, | ||
+ | numberofclaims, | ||
+ | primaryexaminerfirstname, | ||
+ | primaryexaminerlastname, | ||
+ | primaryexaminerdepartment, | ||
+ | pctpatentnumber, | ||
+ | filename, | ||
+ | -1, | ||
+ | -1, | ||
+ | -1, | ||
+ | -1, | ||
+ | NULL, | ||
+ | -1 | ||
+ | FROM patents | ||
+ | ); | ||
+ | -- RESULT : INSERT 0 1646225 | ||
+ | |||
+ | COPY SCRIPTS: | ||
+ | patentdata: | ||
+ | \COPY patents_merged TO '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV; | ||
+ | --COPY 3984771 | ||
+ | |||
+ | patent_2015: | ||
+ | \COPY patents_merged TO '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV; | ||
+ | --COPY 1646225 | ||
+ | |||
+ | PATENTS TABLE | ||
+ | \COPY patents FROM '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV; | ||
+ | -- RESULT : COPY 3984771 | ||
+ | \COPY patents FROM '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS '' CSV; | ||
+ | -- RESULT : COPY 1646225 | ||
+ | |||
+ | |||
+ | TESTING: | ||
+ | select count(*) FROM (SELECT DISTINCT patentnumber FROM patents) AS t; | ||
+ | --RESULT: 5411151 | ||
+ | EXPECTED: 5426566 | ||
+ | |||
+ | SELECT COUNT(*), * | ||
+ | FROM patents | ||
+ | GROUP BY | ||
+ | patentnumber, | ||
+ | kind, | ||
+ | grantdate, | ||
+ | type, | ||
+ | applicationnumber, | ||
+ | filingdate, | ||
+ | prioritydate, | ||
+ | prioritycountry, | ||
+ | prioritypatentnumber, | ||
+ | ussubclass, | ||
+ | maingroup, | ||
+ | subgroup, | ||
+ | cpcsubclass, | ||
+ | cpcmaingroup, | ||
+ | cpcsubgroup, | ||
+ | classificationnationalcountry, | ||
+ | classificationnationalclass, | ||
+ | title, | ||
+ | numberofclaims, | ||
+ | primaryexaminerfirstname, | ||
+ | primaryexaminerlastname, | ||
+ | primaryexaminerdepartment, | ||
+ | pctpatentnumber, | ||
+ | filename, | ||
+ | claims, | ||
+ | apptype, | ||
+ | appnum, | ||
+ | gyear, | ||
+ | appdate, | ||
+ | appyear | ||
+ | HAVING COUNT(*) > 1; | ||
+ | |||
+ | SELECT patentnumber, count(*) | ||
+ | FROM patents | ||
+ | GROUP BY patentnumber | ||
+ | HAVING count(*)>1; | ||
+ | --7640598 | ||
+ | |||
+ | |||
+ | SELECT * | ||
+ | FROM patents op | ||
+ | WHERE op.patentnumber IN | ||
+ | ( | ||
+ | SELECT ip.patentnumber | ||
+ | FROM patents ip | ||
+ | GROUP BY ip.patentnumber | ||
+ | HAVING COUNT(*)>1 | ||
+ | ) | ||
+ | ORDER BY op.patentnumber; | ||
+ | |||
+ | ( | ||
+ | SELECT * | ||
+ | INTO patentsCleaned | ||
+ | FROM patents op | ||
+ | WHERE op.patentnumber IN | ||
+ | ( | ||
+ | SELECT ip.patentnumber | ||
+ | FROM patents ip | ||
+ | GROUP BY ip.patentnumber | ||
+ | HAVING COUNT(*)=1 | ||
+ | ) | ||
+ | ORDER BY op.patentnumber | ||
+ | ) | ||
+ | --SELECT 5191306 | ||
+ | |||
+ | |||
+ | INSERT INTO patentsCleaned( | ||
+ | SELECT * | ||
+ | FROM patents op | ||
+ | WHERE op.patentnumber IN | ||
+ | ( | ||
+ | SELECT ip.patentnumber | ||
+ | FROM patents ip | ||
+ | GROUP BY ip.patentnumber | ||
+ | HAVING COUNT(*)>1 | ||
+ | ) | ||
+ | AND op.applicationnumber NOT LIKE 'NULL' | ||
+ | ORDER BY op.patentnumber | ||
+ | ); | ||
+ | |||
+ | --219845 | ||
+ | |||
+ | TESTING: | ||
+ | allpatent=# select count(*) from patentsCleaned; | ||
+ | count | ||
+ | --------- | ||
+ | 5411151 | ||
+ | (1 row) | ||
+ | |||
+ | allpatent=# select count(*), patentnumber FROM patentsCleaned group by patentnumber having count(*) > 1; | ||
+ | count | patentnumber | ||
+ | -------+-------------- | ||
+ | (0 rows) | ||
+ | |||
+ | |||
+ | ********** INDEX CREATION ************** | ||
+ | |||
+ | ALTER TABLE patents ADD PRIMARY KEY (patentnumber); | ||
+ | -- RESULT : ALTER TABLE | ||
+ | allpatent=# CREATE UNIQUE INDEX patent_idx ON patents (patentnumber); | ||
+ | |||
+ | allpatent=# CREATE INDEX ON assignees (orgname); | ||
+ | CREATE INDEX | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | **Citations | ||
To Extract Patents with Numbers Only and to Ignore Other RegExes | To Extract Patents with Numbers Only and to Ignore Other RegExes |
Revision as of 16:36, 14 June 2016
Patentdata: Column | Type | Modifiers
+-------------------+-----------
patent | integer | kind | character varying | claims | integer | apptype | integer | appnum | integer | gdate | date | gyear | integer | appdate | date | appyear | integer |
Column Names:
patent int, kind varchar, claims int, apptype int, appnum int, gdate date, gyear int, appdate date, appyear int
Patent_2015: Column | Type | Modifiers -------------------------------+---------+----------- patentnumber | int | not null kind | varchar | grantdate | date | type | varchar | applicationnumber | varchar | filingdate | date | prioritydate | date | prioritycountry | varchar | prioritypatentnumber | varchar | ussubclass | varchar | maingroup | varchar | subgroup | varchar | cpcsubclass | varchar | cpcmaingroup | varchar | cpcsubgroup | varchar | classificationnationalcountry | varchar | classificationnationalclass | varchar | title | varchar | numberofclaims | int | primaryexaminerfirstname | varchar | primaryexaminerlastname | varchar | primaryexaminerdepartment | varchar | pctpatentnumber | varchar | filename | varchar | Column Names: patentnumber int, kind varchar, grantdate date, type varchar, applicationnumber varchar, filingdate date, prioritydate date, prioritycountry varchar, prioritypatentnumber varchar, ussubclass varchar, maingroup varchar, subgroup varchar, cpcsubclass varchar, cpcmaingroup varchar, cpcsubgroup varchar, classificationnationalcountry varchar, classificationnationalclass varchar, title varchar, numberofclaims int, primaryexaminerfirstname varchar, primaryexaminerlastname varchar, primaryexaminerdepartment varchar, pctpatentnumber varchar, filename varchar
Combined Schema:
Column Names:
patent int, kind varchar, claims int, apptype int, appnum int, gdate date, gyear int, appdate date, appyear int
patentnumber int, -- patent kind varchar, -- kind grantdate date, --gdate type varchar, applicationnumber varchar, filingdate date, prioritydate date, prioritycountry varchar, prioritypatentnumber varchar, ussubclass varchar, maingroup varchar, subgroup varchar, cpcsubclass varchar, cpcmaingroup varchar, cpcsubgroup varchar, classificationnationalcountry varchar, classificationnationalclass varchar, title varchar, numberofclaims int, primaryexaminerfirstname varchar, primaryexaminerlastname varchar, primaryexaminerdepartment varchar, pctpatentnumber varchar, filename varchar claims int, apptype int, appnum int, gyear int, appdate date, appyear int Output Schema: patents CREATE TABLE patents_merged( patentnumber int, kind varchar, grantdate date, type varchar, applicationnumber varchar, filingdate date, prioritydate date, prioritycountry varchar, prioritypatentnumber varchar, ussubclass varchar, maingroup varchar, subgroup varchar, cpcsubclass varchar, cpcmaingroup varchar, cpcsubgroup varchar, classificationnationalcountry varchar, classificationnationalclass varchar, title varchar, numberofclaims int, primaryexaminerfirstname varchar, primaryexaminerlastname varchar, primaryexaminerdepartment varchar, pctpatentnumber varchar, filename varchar, claims int, apptype int, appnum int, gyear int, appdate date, appyear int ); patentdata: INSERT INTO patents_merged ( SELECT patent,
kind, gdate, 'NULL', 'NULL', NULL, NULL, 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', -1, 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', claims, apptype, appnum, gyear, appdate,
appyear FROM patents ); -- RESULT : INSERT 0 3984771 patent_2015: INSERT INTO patents_merged ( SELECT patentnumber,
kind, grantdate, type, applicationnumber, filingdate, prioritydate, prioritycountry, prioritypatentnumber, ussubclass, maingroup, subgroup, cpcsubclass, cpcmaingroup, cpcsubgroup, classificationnationalcountry, classificationnationalclass, title, numberofclaims, primaryexaminerfirstname, primaryexaminerlastname, primaryexaminerdepartment, pctpatentnumber, filename, -1, -1, -1, -1, NULL,
-1 FROM patents ); -- RESULT : INSERT 0 1646225
COPY SCRIPTS: patentdata: \COPY patents_merged TO '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS CSV; --COPY 3984771
patent_2015: \COPY patents_merged TO '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS CSV; --COPY 1646225
PATENTS TABLE \COPY patents FROM '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS CSV; -- RESULT : COPY 3984771 \COPY patents FROM '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS CSV; -- RESULT : COPY 1646225
TESTING:
select count(*) FROM (SELECT DISTINCT patentnumber FROM patents) AS t;
--RESULT: 5411151
EXPECTED: 5426566
SELECT COUNT(*), * FROM patents GROUP BY patentnumber, kind, grantdate, type, applicationnumber, filingdate, prioritydate, prioritycountry, prioritypatentnumber, ussubclass, maingroup, subgroup, cpcsubclass, cpcmaingroup, cpcsubgroup, classificationnationalcountry, classificationnationalclass, title, numberofclaims, primaryexaminerfirstname, primaryexaminerlastname, primaryexaminerdepartment, pctpatentnumber, filename, claims, apptype, appnum, gyear, appdate, appyear HAVING COUNT(*) > 1;
SELECT patentnumber, count(*) FROM patents GROUP BY patentnumber HAVING count(*)>1; --7640598
SELECT *
FROM patents op
WHERE op.patentnumber IN
(
SELECT ip.patentnumber
FROM patents ip
GROUP BY ip.patentnumber
HAVING COUNT(*)>1
)
ORDER BY op.patentnumber;
( SELECT * INTO patentsCleaned FROM patents op WHERE op.patentnumber IN ( SELECT ip.patentnumber FROM patents ip GROUP BY ip.patentnumber HAVING COUNT(*)=1 ) ORDER BY op.patentnumber ) --SELECT 5191306
INSERT INTO patentsCleaned(
SELECT *
FROM patents op
WHERE op.patentnumber IN
(
SELECT ip.patentnumber
FROM patents ip
GROUP BY ip.patentnumber
HAVING COUNT(*)>1
)
AND op.applicationnumber NOT LIKE 'NULL'
ORDER BY op.patentnumber
);
--219845
TESTING: allpatent=# select count(*) from patentsCleaned;
count
5411151
(1 row)
allpatent=# select count(*), patentnumber FROM patentsCleaned group by patentnumber having count(*) > 1;
count | patentnumber
+--------------
(0 rows)
- INDEX CREATION **************
ALTER TABLE patents ADD PRIMARY KEY (patentnumber); -- RESULT : ALTER TABLE allpatent=# CREATE UNIQUE INDEX patent_idx ON patents (patentnumber);
allpatent=# CREATE INDEX ON assignees (orgname); CREATE INDEX
- Patents
Patentdata: Column | Type | Modifiers
+-------------------+-----------
patent | integer | kind | character varying | claims | integer | apptype | integer | appnum | integer | gdate | date | gyear | integer | appdate | date | appyear | integer |
Column Names:
patent int, kind varchar, claims int, apptype int, appnum int, gdate date, gyear int, appdate date, appyear int
Patent_2015: Column | Type | Modifiers -------------------------------+---------+----------- patentnumber | int | not null kind | varchar | grantdate | date | type | varchar | applicationnumber | varchar | filingdate | date | prioritydate | date | prioritycountry | varchar | prioritypatentnumber | varchar | ussubclass | varchar | maingroup | varchar | subgroup | varchar | cpcsubclass | varchar | cpcmaingroup | varchar | cpcsubgroup | varchar | classificationnationalcountry | varchar | classificationnationalclass | varchar | title | varchar | numberofclaims | int | primaryexaminerfirstname | varchar | primaryexaminerlastname | varchar | primaryexaminerdepartment | varchar | pctpatentnumber | varchar | filename | varchar | Column Names: patentnumber int, kind varchar, grantdate date, type varchar, applicationnumber varchar, filingdate date, prioritydate date, prioritycountry varchar, prioritypatentnumber varchar, ussubclass varchar, maingroup varchar, subgroup varchar, cpcsubclass varchar, cpcmaingroup varchar, cpcsubgroup varchar, classificationnationalcountry varchar, classificationnationalclass varchar, title varchar, numberofclaims int, primaryexaminerfirstname varchar, primaryexaminerlastname varchar, primaryexaminerdepartment varchar, pctpatentnumber varchar, filename varchar
Combined Schema:
Column Names:
patent int, kind varchar, claims int, apptype int, appnum int, gdate date, gyear int, appdate date, appyear int
patentnumber int, -- patent kind varchar, -- kind grantdate date, --gdate type varchar, applicationnumber varchar, filingdate date, prioritydate date, prioritycountry varchar, prioritypatentnumber varchar, ussubclass varchar, maingroup varchar, subgroup varchar, cpcsubclass varchar, cpcmaingroup varchar, cpcsubgroup varchar, classificationnationalcountry varchar, classificationnationalclass varchar, title varchar, numberofclaims int, primaryexaminerfirstname varchar, primaryexaminerlastname varchar, primaryexaminerdepartment varchar, pctpatentnumber varchar, filename varchar claims int, apptype int, appnum int, gyear int, appdate date, appyear int Output Schema: patents CREATE TABLE patents_merged( patentnumber int, kind varchar, grantdate date, type varchar, applicationnumber varchar, filingdate date, prioritydate date, prioritycountry varchar, prioritypatentnumber varchar, ussubclass varchar, maingroup varchar, subgroup varchar, cpcsubclass varchar, cpcmaingroup varchar, cpcsubgroup varchar, classificationnationalcountry varchar, classificationnationalclass varchar, title varchar, numberofclaims int, primaryexaminerfirstname varchar, primaryexaminerlastname varchar, primaryexaminerdepartment varchar, pctpatentnumber varchar, filename varchar, claims int, apptype int, appnum int, gyear int, appdate date, appyear int ); patentdata: INSERT INTO patents_merged ( SELECT patent,
kind, gdate, 'NULL', 'NULL', NULL, NULL, 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', -1, 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', claims, apptype, appnum, gyear, appdate,
appyear FROM patents ); -- RESULT : INSERT 0 3984771 patent_2015: INSERT INTO patents_merged ( SELECT patentnumber,
kind, grantdate, type, applicationnumber, filingdate, prioritydate, prioritycountry, prioritypatentnumber, ussubclass, maingroup, subgroup, cpcsubclass, cpcmaingroup, cpcsubgroup, classificationnationalcountry, classificationnationalclass, title, numberofclaims, primaryexaminerfirstname, primaryexaminerlastname, primaryexaminerdepartment, pctpatentnumber, filename, -1, -1, -1, -1, NULL,
-1 FROM patents ); -- RESULT : INSERT 0 1646225
COPY SCRIPTS: patentdata: \COPY patents_merged TO '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS CSV; --COPY 3984771
patent_2015: \COPY patents_merged TO '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS CSV; --COPY 1646225
PATENTS TABLE \COPY patents FROM '/tmp/merged_patents_export1.txt' DELIMITER AS E'\t' HEADER NULL AS CSV; -- RESULT : COPY 3984771 \COPY patents FROM '/tmp/merged_patents_export.txt' DELIMITER AS E'\t' HEADER NULL AS CSV; -- RESULT : COPY 1646225
TESTING:
select count(*) FROM (SELECT DISTINCT patentnumber FROM patents) AS t;
--RESULT: 5411151
EXPECTED: 5426566
SELECT COUNT(*), * FROM patents GROUP BY patentnumber, kind, grantdate, type, applicationnumber, filingdate, prioritydate, prioritycountry, prioritypatentnumber, ussubclass, maingroup, subgroup, cpcsubclass, cpcmaingroup, cpcsubgroup, classificationnationalcountry, classificationnationalclass, title, numberofclaims, primaryexaminerfirstname, primaryexaminerlastname, primaryexaminerdepartment, pctpatentnumber, filename, claims, apptype, appnum, gyear, appdate, appyear HAVING COUNT(*) > 1;
SELECT patentnumber, count(*) FROM patents GROUP BY patentnumber HAVING count(*)>1; --7640598
SELECT *
FROM patents op
WHERE op.patentnumber IN
(
SELECT ip.patentnumber
FROM patents ip
GROUP BY ip.patentnumber
HAVING COUNT(*)>1
)
ORDER BY op.patentnumber;
( SELECT * INTO patentsCleaned FROM patents op WHERE op.patentnumber IN ( SELECT ip.patentnumber FROM patents ip GROUP BY ip.patentnumber HAVING COUNT(*)=1 ) ORDER BY op.patentnumber ) --SELECT 5191306
INSERT INTO patentsCleaned(
SELECT *
FROM patents op
WHERE op.patentnumber IN
(
SELECT ip.patentnumber
FROM patents ip
GROUP BY ip.patentnumber
HAVING COUNT(*)>1
)
AND op.applicationnumber NOT LIKE 'NULL'
ORDER BY op.patentnumber
);
--219845
TESTING: allpatent=# select count(*) from patentsCleaned;
count
5411151
(1 row)
allpatent=# select count(*), patentnumber FROM patentsCleaned group by patentnumber having count(*) > 1;
count | patentnumber
+--------------
(0 rows)
- INDEX CREATION **************
ALTER TABLE patents ADD PRIMARY KEY (patentnumber); -- RESULT : ALTER TABLE allpatent=# CREATE UNIQUE INDEX patent_idx ON patents (patentnumber);
allpatent=# CREATE INDEX ON assignees (orgname); CREATE INDEX
- Citations
To Extract Patents with Numbers Only and to Ignore Other RegExes CREATE OR REPLACE FUNCTION cleanpatno (text) RETURNS bigint AS $$ if ($_[0]) { my $var=$_[0]; if ($var=~/^\d*$/) {return $var;} return undef; } return undef; $$ LANGUAGE plperl;
- Columns *********************
patentdata:
Column | Type | Modifiers
+-------------------+-----------
patent | integer | cit_date | date | cit_name | character varying | cit_kind | character varying | cit_country | character varying | citation | integer | category | character varying | citseq | integer |
SELECT patent as citingpatentnumber, citation AS citedpatentnumber INTO citations_merged FROM citations; --SELECT 38452957
patent_2015:
Column | Type | Modifiers
+---------+-----------
citingpatentnumber | integer | citingpatentcountry | text | citedpatentnumber | text | citedpatentcountry | text |
SELECT CAST(citingpatentnumber AS bigint), CAST(cleanpatno( citedpatentnumber) AS bigint) as citedpatentnumber INTO citations_merged FROM citations; -- RESULT : SELECT 59227881
FINAL TABLE:
CREATE TABLE citations ( citingpatentnumber bigint, citedpatentnumber bigint );
Copy Statements:
patentdata: \COPY citations_merged TO '/tmp/merged_citations_export1.txt' DELIMITER AS E'\t' HEADER NULL AS CSV; --COPY 38452957
patent_2015: \COPY citations_merged TO '/tmp/merged_citations_export.txt' DELIMITER AS E'\t' HEADER NULL AS CSV; -- RESULT : COPY 59227881
allpatent: \COPY citations FROM '/tmp/merged_citations_export.txt' DELIMITER AS E'\t' HEADER NULL AS CSV; --RESULT : COPY 59227881
\COPY citations FROM '/tmp/merged_citations_export1.txt' DELIMITER AS E'\t' HEADER NULL AS CSV; --RESULT: COPY 38452957
CLONING:
CREATE DATABASE allpatentsProcessed WITH TEMPLATE allpatent OWNER researcher;