Difference between revisions of "Patent Data Cleanup (June 2016)"
Jump to navigation
Jump to search
(→Script) |
|||
Line 279: | Line 279: | ||
CREATE DATABASE allpatent_clone WITH TEMPLATE allpatent OWNER dbuser; | CREATE DATABASE allpatent_clone WITH TEMPLATE allpatent OWNER dbuser; | ||
+ | |||
+ | == Renaming Tables and Columns == | ||
+ | |||
+ | To standardize table and column names, and to make them as user-friendly as possible, a few tables and columns have been renamed. | ||
+ | * '''allpatent''' database -> '''patent''' | ||
+ | * assignees -> assignee | ||
+ | * judges -> judge | ||
+ | * citations -> citation | ||
+ | * matchassignees -> MatchOrgNames | ||
+ | * patents -> patent | ||
+ | * assignees -> ptoassignee | ||
+ | * assignments -> ptoassignment | ||
+ | * assignors -> ptoassignor | ||
+ | * patentassignment -> ptopatentfile | ||
+ | * properties -> ptoproperty | ||
+ | * mslfee -> feestatus | ||
+ | * patentmaintenancefee -> fee |
Revision as of 10:45, 7 July 2016
About this Page
This page contains the script that was used to clean up the patents and assignees tables in allpatent.
Script
ALTER TABLE patents RENAME COLUMN patentnumber TO patent;
ALTER TABLE patents DROP COLUMN kind, DROP COLUMN title, DROP COLUMN ussubclass, ** DROP COLUMN maingroup, -- DROP COLUMN subgroup,-- DROP COLUMN cpcsubclass, ++ DROP COLUMN cpcmaingroup, ++ DROP COLUMN classificationnationalcountry, DROP COLUMN classificationnationalclass,** (?) DROP COLUMN primaryexaminerfirstname, DROP COLUMN primaryexaminerlastname, DROP COLUMN primaryexaminerdepartment, DROP COLUMN filename;
UPDATE patents SET type = '2015' WHERE type != 'NULL';
-- RESULT : UPDATE 1646225
UPDATE patents SET type = '2010' WHERE type = 'NULL'; -- RESULT : UPDATE 3764926
/* Join the historical patent data from the US PTO with the patents table */
ALTER TABLE PATENTS ADD COLUMN nber INT, ADD COLUMN uspc varchar, ADD COLUMN uspc_sub varchar;
UPDATE patents p SET nber = hp.nber, uspc = hp.uspc, uspc_sub = hp.uspc FROM historicalpatentdata hp WHERE hp.patentnumber = CAST(p.patent AS varchar);
-- RESULT : UPDATE 5113655
/* Mergeing some columns - claims and number of claims - column name : claims*/ UPDATE patents SET claims = numberofclaims WHERE claims = -1; -- RESULTS : UPDATE 1646225
/* Merging columns - UPDATE patents SET appnum = CAST (applicationnumber AS INT) where appnum = -1; -- RESULT : UPDATE 1646225
UPDATE patents SET appdate = filingdate where appdate = '0001-01-01 BC' OR filingdate is not NULL;
-- RESULT UPDATE 1646225
ALTER TABLE patents DROP COLUMN apptype;
/* Generating GYear and AppYear from the dates */ UPDATE patents SET gyear = EXTRACT(year from grantdate) WHERE gyear = -1 AND grantdate IS NOT NULL; UPDATE 1646225
UPDATE patents SET appyear = EXTRACT(year from appdate) WHERE appyear = -1 AND appdate is not null; -- RESULT UPDATE 1646225
/* Test Script */ SELECT patentnumber, ussubclass, maingroup, subgroup, cpcsubclass, cpcmaingroup, cpcsubgroup, classificationnationalcountry, classificationnationalclass FROM Patents LIMIT 100;
patent | integer | not null grantdate | date | type | character varying | applicationnumber | character varying | filingdate | date | prioritydate | date | prioritycountry | character varying | prioritypatentnumber | character varying | cpcsubgroup | character varying | numberofclaims | integer | pctpatentnumber | character varying | claims | integer | appnum | integer | gyear | integer | appdate | date | appyear | integer | nber | integer | uspc | character varying | uspc_sub | character varying
/* Drop the merged columns */
ALTER TABLE patents DROP COLUMN numberofclaims, DROP COLUMN filingdate, DROP COLUMN applicationnumber, DROP COLUMN type;
UPDATE patents SET prioritycountry = NULL WHERE prioritycountry = 'NULL';
UPDATE patents SET pctpatentnumber = NULL WHERE pctpatentnumber = 'NULL';
UPDATE patents SET prioritypatentnumber = NULL WHERE prioritypatentnumber = 'NULL';
UPDATE patents SET cpcsubgroup = NULL WHERE cpcsubgroup = 'NULL';
UPDATE patents SET appnum = NULL WHERE appnum = -1;
UPDATE patents SET gyear = NULL WHERE gyear = -1;
UPDATE patents
SET appyear = NULL WHERE appyear = -1;
Results:
allpatent=# ALTER TABLE patents allpatent-# DROP COLUMN numberofclaims, allpatent-# DROP COLUMN filingdate, allpatent-# DROP COLUMN applicationnumber, allpatent-# DROP COLUMN type; ALTER TABLE allpatent=# allpatent=# allpatent=# UPDATE patents allpatent-# SET prioritycountry = NULL allpatent-# WHERE prioritycountry = 'NULL'; ^[[BUPDATE 3764926 allpatent=# allpatent=# UPDATE patents allpatent-# SET pctpatentnumber = NULL allpatent-# WHERE pctpatentnumber = 'NULL'; UPDATE 3764926 allpatent=# allpatent=# UPDATE patents allpatent-# SET prioritypatentnumber = NULL allpatent-# WHERE prioritypatentnumber = 'NULL'; UPDATE 3764926 allpatent=# allpatent=# UPDATE patents allpatent-# SET cpcsubgroup = NULL allpatent-# WHERE cpcsubgroup = 'NULL'; UPDATE 3764926 allpatent=# allpatent=# UPDATE patents allpatent-# SET appnum = NULL allpatent-# WHERE appnum = -1; UPDATE 0 allpatent=# allpatent=# UPDATE patents allpatent-# SET gyear = NULL allpatent-# WHERE gyear = -1; UPDATE 0 allpatent=# allpatent=# UPDATE patents allpatent-# SET appyear = NULL allpatent-# WHERE appyear = -1; UPDATE 0 allpatent=#
UPDATE assignees SET lastname = NULL WHERE lastname = 'null';
UPDATE assignees SET firstname = NULL WHERE firstname = 'null';
UPDATE assignees SET address = NULL WHERE address = 'null';
UPDATE assignees SET postcode = NULL WHERE postcode = 'null';
UPDATE assignees SET patentcountry = NULL WHERE patentcountry = 'null';
UPDATE assignees SET nationality2 = NULL WHERE nationality2 = 'null';
UPDATE assignees SET residence = NULL WHERE residence = 'null';
UPDATE assignees SET asgseq = NULL WHERE asgseq= -1;
UPDATE assignees SET asgtype = NULL WHERE asgtype = -1;
RESULTS:
UPDATE assignees allpatent-# SET lastname = NULL allpatent-# WHERE lastname = 'null';
UPDATE 3818842 allpatent=# allpatent=# UPDATE assignees allpatent-# SET firstname = NULL allpatent-# WHERE firstname = 'null'; UPDATE 3818842 allpatent=# allpatent=# UPDATE assignees allpatent-# SET address = NULL allpatent-# WHERE address = 'null'; UPDATE 3818842 allpatent=# allpatent=# UPDATE assignees allpatent-# SET postcode = NULL allpatent-# WHERE postcode = 'null'; UPDATE 3818842 allpatent=# allpatent=# UPDATE assignees allpatent-# SET patentcountry = NULL allpatent-# WHERE patentcountry = 'null'; UPDATE 3818842 allpatent=# allpatent=# UPDATE assignees allpatent-# SET nationality2 = NULL allpatent-# WHERE nationality2 = 'null'; UPDATE 1607714 allpatent=# allpatent=# UPDATE assignees allpatent-# SET residence = NULL allpatent-# WHERE residence = 'null'; UPDATE 1607714 allpatent=# allpatent=# UPDATE assignees allpatent-# SET asgseq = NULL allpatent-# WHERE asgseq= -1; UPDATE 1607714 allpatent=# allpatent=# UPDATE assignees allpatent-# SET asgtype = NULL allpatent-# WHERE asgtype = -1; UPDATE 1607714 allpatent=#
CREATE DATABASE allpatent_clone WITH TEMPLATE allpatent OWNER dbuser;
Renaming Tables and Columns
To standardize table and column names, and to make them as user-friendly as possible, a few tables and columns have been renamed.
- allpatent database -> patent
- assignees -> assignee
- judges -> judge
- citations -> citation
- matchassignees -> MatchOrgNames
- patents -> patent
- assignees -> ptoassignee
- assignments -> ptoassignment
- assignors -> ptoassignor
- patentassignment -> ptopatentfile
- properties -> ptoproperty
- mslfee -> feestatus
- patentmaintenancefee -> fee