Difference between revisions of "Patent Data Cleanup (June 2016)"
Jump to navigation
Jump to search
(Created page with "== About this Page == This page contains the script that was used to clean up the patents and assignees tables in allpatent. == Script ==") |
|||
(12 intermediate revisions by 4 users not shown) | |||
Line 1: | Line 1: | ||
+ | {{Project | ||
+ | |Has project output=Data | ||
+ | |Has sponsor=McNair Center | ||
+ | |Has title=Patent Data Cleanup - June 2016 | ||
+ | |Has owner=Marcela Interiano, | ||
+ | |Has project status=Subsume | ||
+ | |Has keywords=Data | ||
+ | }} | ||
== About this Page == | == About this Page == | ||
This page contains the script that was used to clean up the patents and assignees tables in allpatent. | This page contains the script that was used to clean up the patents and assignees tables in allpatent. | ||
+ | |||
+ | Cleaning up includes: | ||
+ | * Cleaning 'NULL' string and -1 inserts : at the time of merging the patentdata and patent_2015 databases, I inserted 'NULL' strings and -1 in integer columns to differentiate between NULLs that came from the vendor, and 'NULL's that I inserted because of no column overlap. | ||
+ | ** The 'NULL's got replaced with NULL | ||
+ | ** The -1s got replaced with NULL as well. | ||
+ | |||
+ | * Merging some more columns, and dropping unnecessary columns: | ||
+ | ** At the time of merging the tables, some columns, particularly in the patent table, were not merged as they should have been. | ||
+ | ** The script that follows merges those columns as well. | ||
+ | NOTE: The patent data page detailing the SQL steps followed to merge the data now has the updated table structures. The script on this page can be used as a reference when trying to debug any (unlikely) merging errors | ||
+ | |||
+ | * Renaming tables and columns | ||
+ | ** Table names and column names have been standardized. | ||
+ | ** General rule of thumb is : short column names, singular table names (for example : patent and not patents) | ||
== Script == | == Script == | ||
+ | ALTER TABLE patents | ||
+ | RENAME COLUMN patentnumber TO patent; | ||
+ | |||
+ | ALTER TABLE patents | ||
+ | DROP COLUMN kind, | ||
+ | DROP COLUMN title, | ||
+ | DROP COLUMN ussubclass, ** | ||
+ | DROP COLUMN maingroup, -- | ||
+ | DROP COLUMN subgroup,-- | ||
+ | DROP COLUMN cpcsubclass, ++ | ||
+ | DROP COLUMN cpcmaingroup, ++ | ||
+ | DROP COLUMN classificationnationalcountry, | ||
+ | DROP COLUMN classificationnationalclass,** (?) | ||
+ | DROP COLUMN primaryexaminerfirstname, | ||
+ | DROP COLUMN primaryexaminerlastname, | ||
+ | DROP COLUMN primaryexaminerdepartment, | ||
+ | DROP COLUMN filename; | ||
+ | |||
+ | UPDATE patents | ||
+ | SET type = '2015' | ||
+ | WHERE type != 'NULL'; | ||
+ | |||
+ | -- RESULT : UPDATE 1646225 | ||
+ | |||
+ | UPDATE patents | ||
+ | SET type = '2010' | ||
+ | WHERE type = 'NULL'; | ||
+ | -- RESULT : UPDATE 3764926 | ||
+ | |||
+ | /* Join the historical patent data from the US PTO with the patents table */ | ||
+ | |||
+ | ALTER TABLE PATENTS | ||
+ | ADD COLUMN nber INT, | ||
+ | ADD COLUMN uspc varchar, | ||
+ | ADD COLUMN uspc_sub varchar; | ||
+ | |||
+ | UPDATE patents p | ||
+ | SET nber = hp.nber, | ||
+ | uspc = hp.uspc, | ||
+ | uspc_sub = hp.uspc | ||
+ | FROM historicalpatentdata hp | ||
+ | WHERE hp.patentnumber = CAST(p.patent AS varchar); | ||
+ | |||
+ | -- RESULT : UPDATE 5113655 | ||
+ | |||
+ | /* Mergeing some columns - claims and number of claims - column name : claims*/ | ||
+ | UPDATE patents | ||
+ | SET claims = numberofclaims | ||
+ | WHERE claims = -1; | ||
+ | -- RESULTS : UPDATE 1646225 | ||
+ | |||
+ | /* Merging columns - | ||
+ | UPDATE patents | ||
+ | SET appnum = CAST (applicationnumber AS INT) | ||
+ | where appnum = -1; | ||
+ | -- RESULT : UPDATE 1646225 | ||
+ | |||
+ | UPDATE patents | ||
+ | SET appdate = filingdate | ||
+ | where appdate = '0001-01-01 BC' | ||
+ | OR filingdate is not NULL; | ||
+ | |||
+ | -- RESULT UPDATE 1646225 | ||
+ | |||
+ | ALTER TABLE patents | ||
+ | DROP COLUMN apptype; | ||
+ | |||
+ | /* Generating GYear and AppYear from the dates */ | ||
+ | UPDATE patents | ||
+ | SET gyear = EXTRACT(year from grantdate) | ||
+ | WHERE gyear = -1 | ||
+ | AND grantdate IS NOT NULL; | ||
+ | UPDATE 1646225 | ||
+ | |||
+ | UPDATE patents | ||
+ | SET appyear = EXTRACT(year from appdate) | ||
+ | WHERE appyear = -1 | ||
+ | AND appdate is not null; | ||
+ | -- RESULT UPDATE 1646225 | ||
+ | |||
+ | /* Test Script */ | ||
+ | SELECT patentnumber, ussubclass, maingroup, subgroup, cpcsubclass, cpcmaingroup, cpcsubgroup, classificationnationalcountry, classificationnationalclass FROM Patents LIMIT 100; | ||
+ | |||
+ | patent | integer | not null | ||
+ | grantdate | date | | ||
+ | type | character varying | | ||
+ | applicationnumber | character varying | | ||
+ | filingdate | date | | ||
+ | prioritydate | date | | ||
+ | prioritycountry | character varying | | ||
+ | prioritypatentnumber | character varying | | ||
+ | cpcsubgroup | character varying | | ||
+ | numberofclaims | integer | | ||
+ | pctpatentnumber | character varying | | ||
+ | claims | integer | | ||
+ | appnum | integer | | ||
+ | gyear | integer | | ||
+ | appdate | date | | ||
+ | appyear | integer | | ||
+ | nber | integer | | ||
+ | uspc | character varying | | ||
+ | uspc_sub | character varying | ||
+ | |||
+ | /* Drop the merged columns */ | ||
+ | |||
+ | ALTER TABLE patents | ||
+ | DROP COLUMN numberofclaims, | ||
+ | DROP COLUMN filingdate, | ||
+ | DROP COLUMN applicationnumber, | ||
+ | DROP COLUMN type; | ||
+ | |||
+ | |||
+ | UPDATE patents | ||
+ | SET prioritycountry = NULL | ||
+ | WHERE prioritycountry = 'NULL'; | ||
+ | |||
+ | UPDATE patents | ||
+ | SET pctpatentnumber = NULL | ||
+ | WHERE pctpatentnumber = 'NULL'; | ||
+ | |||
+ | UPDATE patents | ||
+ | SET prioritypatentnumber = NULL | ||
+ | WHERE prioritypatentnumber = 'NULL'; | ||
+ | |||
+ | UPDATE patents | ||
+ | SET cpcsubgroup = NULL | ||
+ | WHERE cpcsubgroup = 'NULL'; | ||
+ | |||
+ | UPDATE patents | ||
+ | SET appnum = NULL | ||
+ | WHERE appnum = -1; | ||
+ | |||
+ | UPDATE patents | ||
+ | SET gyear = NULL | ||
+ | WHERE gyear = -1; | ||
+ | |||
+ | UPDATE patents | ||
+ | |||
+ | SET appyear = NULL | ||
+ | WHERE appyear = -1; | ||
+ | |||
+ | Results: | ||
+ | |||
+ | allpatent=# ALTER TABLE patents | ||
+ | allpatent-# DROP COLUMN numberofclaims, | ||
+ | allpatent-# DROP COLUMN filingdate, | ||
+ | allpatent-# DROP COLUMN applicationnumber, | ||
+ | allpatent-# DROP COLUMN type; | ||
+ | ALTER TABLE | ||
+ | allpatent=# | ||
+ | allpatent=# | ||
+ | allpatent=# UPDATE patents | ||
+ | allpatent-# SET prioritycountry = NULL | ||
+ | allpatent-# WHERE prioritycountry = 'NULL'; | ||
+ | ^[[BUPDATE 3764926 | ||
+ | allpatent=# | ||
+ | allpatent=# UPDATE patents | ||
+ | allpatent-# SET pctpatentnumber = NULL | ||
+ | allpatent-# WHERE pctpatentnumber = 'NULL'; | ||
+ | UPDATE 3764926 | ||
+ | allpatent=# | ||
+ | allpatent=# UPDATE patents | ||
+ | allpatent-# SET prioritypatentnumber = NULL | ||
+ | allpatent-# WHERE prioritypatentnumber = 'NULL'; | ||
+ | UPDATE 3764926 | ||
+ | allpatent=# | ||
+ | allpatent=# UPDATE patents | ||
+ | allpatent-# SET cpcsubgroup = NULL | ||
+ | allpatent-# WHERE cpcsubgroup = 'NULL'; | ||
+ | UPDATE 3764926 | ||
+ | allpatent=# | ||
+ | allpatent=# UPDATE patents | ||
+ | allpatent-# SET appnum = NULL | ||
+ | allpatent-# WHERE appnum = -1; | ||
+ | UPDATE 0 | ||
+ | allpatent=# | ||
+ | allpatent=# UPDATE patents | ||
+ | allpatent-# SET gyear = NULL | ||
+ | allpatent-# WHERE gyear = -1; | ||
+ | UPDATE 0 | ||
+ | allpatent=# | ||
+ | allpatent=# UPDATE patents | ||
+ | allpatent-# SET appyear = NULL | ||
+ | allpatent-# WHERE appyear = -1; | ||
+ | UPDATE 0 | ||
+ | allpatent=# | ||
+ | |||
+ | |||
+ | UPDATE assignees | ||
+ | SET lastname = NULL | ||
+ | WHERE lastname = 'null'; | ||
+ | |||
+ | UPDATE assignees | ||
+ | SET firstname = NULL | ||
+ | WHERE firstname = 'null'; | ||
+ | |||
+ | UPDATE assignees | ||
+ | SET address = NULL | ||
+ | WHERE address = 'null'; | ||
+ | |||
+ | UPDATE assignees | ||
+ | SET postcode = NULL | ||
+ | WHERE postcode = 'null'; | ||
+ | |||
+ | UPDATE assignees | ||
+ | SET patentcountry = NULL | ||
+ | WHERE patentcountry = 'null'; | ||
+ | |||
+ | UPDATE assignees | ||
+ | SET nationality2 = NULL | ||
+ | WHERE nationality2 = 'null'; | ||
+ | |||
+ | UPDATE assignees | ||
+ | SET residence = NULL | ||
+ | WHERE residence = 'null'; | ||
+ | |||
+ | UPDATE assignees | ||
+ | SET asgseq = NULL | ||
+ | WHERE asgseq= -1; | ||
+ | |||
+ | UPDATE assignees | ||
+ | SET asgtype = NULL | ||
+ | WHERE asgtype = -1; | ||
+ | |||
+ | RESULTS: | ||
+ | |||
+ | UPDATE assignees | ||
+ | allpatent-# SET lastname = NULL | ||
+ | allpatent-# WHERE lastname = 'null'; | ||
+ | |||
+ | UPDATE 3818842 | ||
+ | allpatent=# | ||
+ | allpatent=# UPDATE assignees | ||
+ | allpatent-# SET firstname = NULL | ||
+ | allpatent-# WHERE firstname = 'null'; | ||
+ | UPDATE 3818842 | ||
+ | allpatent=# | ||
+ | allpatent=# UPDATE assignees | ||
+ | allpatent-# SET address = NULL | ||
+ | allpatent-# WHERE address = 'null'; | ||
+ | UPDATE 3818842 | ||
+ | allpatent=# | ||
+ | allpatent=# UPDATE assignees | ||
+ | allpatent-# SET postcode = NULL | ||
+ | allpatent-# WHERE postcode = 'null'; | ||
+ | UPDATE 3818842 | ||
+ | allpatent=# | ||
+ | allpatent=# UPDATE assignees | ||
+ | allpatent-# SET patentcountry = NULL | ||
+ | allpatent-# WHERE patentcountry = 'null'; | ||
+ | UPDATE 3818842 | ||
+ | allpatent=# | ||
+ | allpatent=# UPDATE assignees | ||
+ | allpatent-# SET nationality2 = NULL | ||
+ | allpatent-# WHERE nationality2 = 'null'; | ||
+ | UPDATE 1607714 | ||
+ | allpatent=# | ||
+ | allpatent=# UPDATE assignees | ||
+ | allpatent-# SET residence = NULL | ||
+ | allpatent-# WHERE residence = 'null'; | ||
+ | UPDATE 1607714 | ||
+ | allpatent=# | ||
+ | allpatent=# UPDATE assignees | ||
+ | allpatent-# SET asgseq = NULL | ||
+ | allpatent-# WHERE asgseq= -1; | ||
+ | UPDATE 1607714 | ||
+ | allpatent=# | ||
+ | allpatent=# UPDATE assignees | ||
+ | allpatent-# SET asgtype = NULL | ||
+ | allpatent-# WHERE asgtype = -1; | ||
+ | UPDATE 1607714 | ||
+ | allpatent=# | ||
+ | |||
+ | |||
+ | CREATE DATABASE allpatent_clone WITH TEMPLATE allpatent OWNER dbuser; | ||
+ | |||
+ | == Renaming Tables and Columns == | ||
+ | |||
+ | To standardize table and column names, and to make them as user-friendly as possible, a few tables and columns have been renamed. | ||
+ | * '''allpatent''' database -> '''patent''' | ||
+ | * assignees -> assignee | ||
+ | * judges -> judge | ||
+ | * citations -> citation | ||
+ | * matchassignees -> MatchOrgNames | ||
+ | * patents -> patent | ||
+ | * assignees -> ptoassignee | ||
+ | * assignments -> ptoassignment | ||
+ | * assignors -> ptoassignor | ||
+ | * patentassignment -> ptopatentfile | ||
+ | * properties -> ptoproperty | ||
+ | * mslfee -> feestatus | ||
+ | * patentmaintenancefee -> fee |
Latest revision as of 12:41, 21 September 2020
Patent Data Cleanup (June 2016) | |
---|---|
Project Information | |
Has title | Patent Data Cleanup - June 2016 |
Has owner | Marcela Interiano |
Has start date | |
Has deadline date | |
Has keywords | Data |
Has project status | Subsume |
Subsumed by: | Patent Assignment Data Restructure |
Has sponsor | McNair Center |
Has project output | Data |
Copyright © 2019 edegan.com. All Rights Reserved. |
About this Page
This page contains the script that was used to clean up the patents and assignees tables in allpatent.
Cleaning up includes:
- Cleaning 'NULL' string and -1 inserts : at the time of merging the patentdata and patent_2015 databases, I inserted 'NULL' strings and -1 in integer columns to differentiate between NULLs that came from the vendor, and 'NULL's that I inserted because of no column overlap.
- The 'NULL's got replaced with NULL
- The -1s got replaced with NULL as well.
- Merging some more columns, and dropping unnecessary columns:
- At the time of merging the tables, some columns, particularly in the patent table, were not merged as they should have been.
- The script that follows merges those columns as well.
NOTE: The patent data page detailing the SQL steps followed to merge the data now has the updated table structures. The script on this page can be used as a reference when trying to debug any (unlikely) merging errors
- Renaming tables and columns
- Table names and column names have been standardized.
- General rule of thumb is : short column names, singular table names (for example : patent and not patents)
Script
ALTER TABLE patents RENAME COLUMN patentnumber TO patent;
ALTER TABLE patents DROP COLUMN kind, DROP COLUMN title, DROP COLUMN ussubclass, ** DROP COLUMN maingroup, -- DROP COLUMN subgroup,-- DROP COLUMN cpcsubclass, ++ DROP COLUMN cpcmaingroup, ++ DROP COLUMN classificationnationalcountry, DROP COLUMN classificationnationalclass,** (?) DROP COLUMN primaryexaminerfirstname, DROP COLUMN primaryexaminerlastname, DROP COLUMN primaryexaminerdepartment, DROP COLUMN filename;
UPDATE patents SET type = '2015' WHERE type != 'NULL';
-- RESULT : UPDATE 1646225
UPDATE patents SET type = '2010' WHERE type = 'NULL'; -- RESULT : UPDATE 3764926
/* Join the historical patent data from the US PTO with the patents table */
ALTER TABLE PATENTS ADD COLUMN nber INT, ADD COLUMN uspc varchar, ADD COLUMN uspc_sub varchar;
UPDATE patents p SET nber = hp.nber, uspc = hp.uspc, uspc_sub = hp.uspc FROM historicalpatentdata hp WHERE hp.patentnumber = CAST(p.patent AS varchar);
-- RESULT : UPDATE 5113655
/* Mergeing some columns - claims and number of claims - column name : claims*/ UPDATE patents SET claims = numberofclaims WHERE claims = -1; -- RESULTS : UPDATE 1646225
/* Merging columns - UPDATE patents SET appnum = CAST (applicationnumber AS INT) where appnum = -1; -- RESULT : UPDATE 1646225
UPDATE patents SET appdate = filingdate where appdate = '0001-01-01 BC' OR filingdate is not NULL;
-- RESULT UPDATE 1646225
ALTER TABLE patents DROP COLUMN apptype;
/* Generating GYear and AppYear from the dates */ UPDATE patents SET gyear = EXTRACT(year from grantdate) WHERE gyear = -1 AND grantdate IS NOT NULL; UPDATE 1646225
UPDATE patents SET appyear = EXTRACT(year from appdate) WHERE appyear = -1 AND appdate is not null; -- RESULT UPDATE 1646225
/* Test Script */ SELECT patentnumber, ussubclass, maingroup, subgroup, cpcsubclass, cpcmaingroup, cpcsubgroup, classificationnationalcountry, classificationnationalclass FROM Patents LIMIT 100;
patent | integer | not null grantdate | date | type | character varying | applicationnumber | character varying | filingdate | date | prioritydate | date | prioritycountry | character varying | prioritypatentnumber | character varying | cpcsubgroup | character varying | numberofclaims | integer | pctpatentnumber | character varying | claims | integer | appnum | integer | gyear | integer | appdate | date | appyear | integer | nber | integer | uspc | character varying | uspc_sub | character varying
/* Drop the merged columns */
ALTER TABLE patents DROP COLUMN numberofclaims, DROP COLUMN filingdate, DROP COLUMN applicationnumber, DROP COLUMN type;
UPDATE patents SET prioritycountry = NULL WHERE prioritycountry = 'NULL';
UPDATE patents SET pctpatentnumber = NULL WHERE pctpatentnumber = 'NULL';
UPDATE patents SET prioritypatentnumber = NULL WHERE prioritypatentnumber = 'NULL';
UPDATE patents SET cpcsubgroup = NULL WHERE cpcsubgroup = 'NULL';
UPDATE patents SET appnum = NULL WHERE appnum = -1;
UPDATE patents SET gyear = NULL WHERE gyear = -1;
UPDATE patents
SET appyear = NULL WHERE appyear = -1;
Results:
allpatent=# ALTER TABLE patents allpatent-# DROP COLUMN numberofclaims, allpatent-# DROP COLUMN filingdate, allpatent-# DROP COLUMN applicationnumber, allpatent-# DROP COLUMN type; ALTER TABLE allpatent=# allpatent=# allpatent=# UPDATE patents allpatent-# SET prioritycountry = NULL allpatent-# WHERE prioritycountry = 'NULL'; ^[[BUPDATE 3764926 allpatent=# allpatent=# UPDATE patents allpatent-# SET pctpatentnumber = NULL allpatent-# WHERE pctpatentnumber = 'NULL'; UPDATE 3764926 allpatent=# allpatent=# UPDATE patents allpatent-# SET prioritypatentnumber = NULL allpatent-# WHERE prioritypatentnumber = 'NULL'; UPDATE 3764926 allpatent=# allpatent=# UPDATE patents allpatent-# SET cpcsubgroup = NULL allpatent-# WHERE cpcsubgroup = 'NULL'; UPDATE 3764926 allpatent=# allpatent=# UPDATE patents allpatent-# SET appnum = NULL allpatent-# WHERE appnum = -1; UPDATE 0 allpatent=# allpatent=# UPDATE patents allpatent-# SET gyear = NULL allpatent-# WHERE gyear = -1; UPDATE 0 allpatent=# allpatent=# UPDATE patents allpatent-# SET appyear = NULL allpatent-# WHERE appyear = -1; UPDATE 0 allpatent=#
UPDATE assignees SET lastname = NULL WHERE lastname = 'null';
UPDATE assignees SET firstname = NULL WHERE firstname = 'null';
UPDATE assignees SET address = NULL WHERE address = 'null';
UPDATE assignees SET postcode = NULL WHERE postcode = 'null';
UPDATE assignees SET patentcountry = NULL WHERE patentcountry = 'null';
UPDATE assignees SET nationality2 = NULL WHERE nationality2 = 'null';
UPDATE assignees SET residence = NULL WHERE residence = 'null';
UPDATE assignees SET asgseq = NULL WHERE asgseq= -1;
UPDATE assignees SET asgtype = NULL WHERE asgtype = -1;
RESULTS:
UPDATE assignees allpatent-# SET lastname = NULL allpatent-# WHERE lastname = 'null';
UPDATE 3818842 allpatent=# allpatent=# UPDATE assignees allpatent-# SET firstname = NULL allpatent-# WHERE firstname = 'null'; UPDATE 3818842 allpatent=# allpatent=# UPDATE assignees allpatent-# SET address = NULL allpatent-# WHERE address = 'null'; UPDATE 3818842 allpatent=# allpatent=# UPDATE assignees allpatent-# SET postcode = NULL allpatent-# WHERE postcode = 'null'; UPDATE 3818842 allpatent=# allpatent=# UPDATE assignees allpatent-# SET patentcountry = NULL allpatent-# WHERE patentcountry = 'null'; UPDATE 3818842 allpatent=# allpatent=# UPDATE assignees allpatent-# SET nationality2 = NULL allpatent-# WHERE nationality2 = 'null'; UPDATE 1607714 allpatent=# allpatent=# UPDATE assignees allpatent-# SET residence = NULL allpatent-# WHERE residence = 'null'; UPDATE 1607714 allpatent=# allpatent=# UPDATE assignees allpatent-# SET asgseq = NULL allpatent-# WHERE asgseq= -1; UPDATE 1607714 allpatent=# allpatent=# UPDATE assignees allpatent-# SET asgtype = NULL allpatent-# WHERE asgtype = -1; UPDATE 1607714 allpatent=#
CREATE DATABASE allpatent_clone WITH TEMPLATE allpatent OWNER dbuser;
Renaming Tables and Columns
To standardize table and column names, and to make them as user-friendly as possible, a few tables and columns have been renamed.
- allpatent database -> patent
- assignees -> assignee
- judges -> judge
- citations -> citation
- matchassignees -> MatchOrgNames
- patents -> patent
- assignees -> ptoassignee
- assignments -> ptoassignment
- assignors -> ptoassignor
- patentassignment -> ptopatentfile
- properties -> ptoproperty
- mslfee -> feestatus
- patentmaintenancefee -> fee