Difference between revisions of "Patent Data Cleanup (June 2016)"

From edegan.com
Jump to navigation Jump to search
(Created page with "== About this Page == This page contains the script that was used to clean up the patents and assignees tables in allpatent. == Script ==")
 
 
(12 intermediate revisions by 4 users not shown)
Line 1: Line 1:
 +
{{Project
 +
|Has project output=Data
 +
|Has sponsor=McNair Center
 +
|Has title=Patent Data Cleanup - June 2016
 +
|Has owner=Marcela Interiano,
 +
|Has project status=Subsume
 +
|Has keywords=Data
 +
}}
 
== About this Page ==
 
== About this Page ==
  
 
This page contains the script that was used to clean up the patents and assignees tables in allpatent.
 
This page contains the script that was used to clean up the patents and assignees tables in allpatent.
 +
 +
Cleaning up includes:
 +
* Cleaning 'NULL' string and -1 inserts : at the time of merging the patentdata and patent_2015 databases, I inserted 'NULL' strings and -1 in integer columns to differentiate between NULLs that came from the vendor, and 'NULL's that I inserted because of no column overlap.
 +
** The 'NULL's got replaced with NULL
 +
** The -1s got replaced with NULL as well.
 +
 +
* Merging some more columns, and dropping unnecessary columns:
 +
** At the time of merging the tables, some columns, particularly in the patent table, were not merged as they should have been.
 +
** The script that follows merges those columns as well.
 +
NOTE: The patent data page detailing the SQL steps followed to merge the data now has the updated table structures. The script on this page can be used as a reference when trying to debug any (unlikely) merging errors
 +
 +
* Renaming tables and columns
 +
** Table names and column names have been standardized.
 +
** General rule of thumb is : short column names, singular table names (for example : patent and not patents)
  
 
== Script ==
 
== Script ==
 +
ALTER TABLE patents
 +
RENAME COLUMN patentnumber TO patent;
 +
 +
ALTER TABLE patents
 +
  DROP COLUMN kind,
 +
  DROP COLUMN title,
 +
  DROP COLUMN ussubclass, **
 +
  DROP COLUMN maingroup, --
 +
  DROP COLUMN subgroup,--
 +
  DROP COLUMN cpcsubclass, ++
 +
  DROP COLUMN cpcmaingroup, ++
 +
  DROP COLUMN classificationnationalcountry,
 +
  DROP COLUMN classificationnationalclass,** (?)
 +
  DROP COLUMN primaryexaminerfirstname,
 +
  DROP COLUMN primaryexaminerlastname,
 +
  DROP COLUMN primaryexaminerdepartment,
 +
  DROP COLUMN filename;
 +
 +
UPDATE patents
 +
  SET type = '2015'
 +
  WHERE type != 'NULL';
 +
 +
-- RESULT : UPDATE 1646225
 +
 +
UPDATE patents
 +
  SET type = '2010'
 +
  WHERE type = 'NULL';
 +
-- RESULT : UPDATE 3764926
 +
 +
/* Join the historical patent data from the US PTO with the patents table */
 +
 +
ALTER TABLE PATENTS
 +
  ADD COLUMN nber INT,
 +
  ADD COLUMN uspc varchar,
 +
  ADD COLUMN uspc_sub varchar;
 +
 +
UPDATE patents p
 +
  SET nber = hp.nber,
 +
  uspc = hp.uspc,
 +
  uspc_sub = hp.uspc
 +
  FROM historicalpatentdata hp
 +
  WHERE hp.patentnumber = CAST(p.patent AS varchar);
 +
 +
-- RESULT : UPDATE 5113655
 +
 +
/* Mergeing some columns - claims and number of claims - column name : claims*/
 +
UPDATE patents
 +
  SET claims = numberofclaims
 +
  WHERE claims = -1;
 +
-- RESULTS : UPDATE 1646225
 +
 +
/* Merging columns -
 +
UPDATE patents
 +
  SET appnum = CAST (applicationnumber AS INT)
 +
  where appnum = -1;
 +
-- RESULT : UPDATE 1646225
 +
 +
UPDATE patents
 +
  SET appdate = filingdate
 +
  where appdate = '0001-01-01 BC'
 +
  OR filingdate is not NULL;
 +
 +
-- RESULT UPDATE 1646225
 +
 +
ALTER TABLE patents
 +
  DROP COLUMN apptype;
 +
 +
/* Generating GYear and AppYear from the dates */
 +
UPDATE patents
 +
  SET gyear = EXTRACT(year from grantdate)
 +
  WHERE gyear = -1
 +
  AND grantdate IS NOT NULL;
 +
UPDATE 1646225
 +
 +
UPDATE patents
 +
  SET appyear = EXTRACT(year from appdate)
 +
  WHERE appyear = -1
 +
  AND appdate is not null;
 +
-- RESULT UPDATE 1646225
 +
 +
/* Test Script */
 +
SELECT patentnumber, ussubclass, maingroup, subgroup, cpcsubclass, cpcmaingroup, cpcsubgroup, classificationnationalcountry, classificationnationalclass FROM Patents LIMIT 100;
 +
 +
patent              | integer          | not null
 +
  grantdate            | date              |
 +
  type                | character varying |
 +
  applicationnumber    | character varying |
 +
  filingdate          | date              |
 +
  prioritydate        | date              |
 +
  prioritycountry      | character varying |
 +
  prioritypatentnumber | character varying |
 +
  cpcsubgroup          | character varying |
 +
  numberofclaims      | integer          |
 +
  pctpatentnumber      | character varying |
 +
  claims              | integer          |
 +
  appnum              | integer          |
 +
  gyear                | integer          |
 +
  appdate              | date              |
 +
  appyear              | integer          |
 +
  nber                | integer          |
 +
  uspc                | character varying |
 +
  uspc_sub            | character varying
 +
 +
/* Drop the merged columns */
 +
 +
ALTER TABLE patents
 +
  DROP COLUMN numberofclaims,
 +
  DROP COLUMN filingdate,
 +
  DROP COLUMN applicationnumber,
 +
  DROP COLUMN type;
 +
 +
 +
UPDATE patents
 +
  SET prioritycountry = NULL
 +
  WHERE prioritycountry = 'NULL';
 +
 +
UPDATE patents
 +
  SET pctpatentnumber = NULL
 +
  WHERE pctpatentnumber = 'NULL';
 +
 +
UPDATE patents
 +
  SET prioritypatentnumber = NULL
 +
  WHERE prioritypatentnumber = 'NULL';
 +
 +
UPDATE patents
 +
  SET cpcsubgroup = NULL
 +
  WHERE cpcsubgroup = 'NULL';
 +
 +
UPDATE patents
 +
  SET appnum = NULL
 +
  WHERE appnum = -1;
 +
 +
UPDATE patents
 +
  SET gyear = NULL
 +
  WHERE gyear = -1;
 +
 +
UPDATE patents
 +
 +
  SET appyear = NULL
 +
  WHERE appyear = -1;
 +
 +
Results:
 +
 +
allpatent=# ALTER TABLE patents
 +
allpatent-# DROP COLUMN numberofclaims,
 +
allpatent-# DROP COLUMN filingdate,
 +
allpatent-# DROP COLUMN applicationnumber,
 +
allpatent-# DROP COLUMN type;
 +
ALTER TABLE
 +
allpatent=#
 +
allpatent=#
 +
allpatent=# UPDATE patents
 +
allpatent-# SET prioritycountry = NULL
 +
allpatent-# WHERE prioritycountry = 'NULL';
 +
^[[BUPDATE 3764926
 +
allpatent=#
 +
allpatent=# UPDATE patents
 +
allpatent-# SET pctpatentnumber = NULL
 +
allpatent-# WHERE pctpatentnumber = 'NULL';
 +
UPDATE 3764926
 +
allpatent=#
 +
allpatent=# UPDATE patents
 +
allpatent-# SET prioritypatentnumber = NULL
 +
allpatent-# WHERE prioritypatentnumber = 'NULL';
 +
UPDATE 3764926
 +
allpatent=#
 +
allpatent=# UPDATE patents
 +
allpatent-# SET cpcsubgroup = NULL
 +
allpatent-# WHERE cpcsubgroup = 'NULL';
 +
UPDATE 3764926
 +
allpatent=#
 +
allpatent=# UPDATE patents
 +
allpatent-# SET appnum = NULL
 +
allpatent-# WHERE appnum = -1;
 +
UPDATE 0
 +
allpatent=#
 +
allpatent=# UPDATE patents
 +
allpatent-# SET gyear = NULL
 +
allpatent-# WHERE gyear = -1;
 +
UPDATE 0
 +
allpatent=#
 +
allpatent=# UPDATE patents
 +
allpatent-# SET appyear = NULL
 +
allpatent-# WHERE appyear = -1;
 +
UPDATE 0
 +
allpatent=#
 +
 +
 +
UPDATE assignees
 +
  SET lastname = NULL
 +
  WHERE lastname = 'null';
 +
 +
UPDATE assignees
 +
  SET firstname = NULL
 +
  WHERE firstname = 'null';
 +
 +
UPDATE assignees
 +
  SET address = NULL
 +
  WHERE address = 'null';
 +
 +
UPDATE assignees
 +
  SET postcode = NULL
 +
  WHERE  postcode = 'null';
 +
 +
UPDATE assignees
 +
  SET patentcountry = NULL
 +
  WHERE patentcountry = 'null';
 +
 +
UPDATE assignees
 +
  SET nationality2 = NULL
 +
  WHERE nationality2 = 'null';
 +
 +
UPDATE assignees
 +
  SET residence = NULL
 +
  WHERE residence = 'null';
 +
 +
UPDATE assignees
 +
  SET asgseq = NULL
 +
  WHERE asgseq= -1;
 +
 +
UPDATE assignees
 +
  SET asgtype = NULL
 +
  WHERE asgtype = -1;
 +
 +
RESULTS:
 +
 +
UPDATE assignees
 +
allpatent-# SET lastname = NULL
 +
allpatent-# WHERE lastname = 'null';
 +
 +
UPDATE 3818842
 +
allpatent=#
 +
allpatent=# UPDATE assignees
 +
allpatent-# SET firstname = NULL
 +
allpatent-# WHERE firstname = 'null';
 +
UPDATE 3818842
 +
allpatent=#
 +
allpatent=# UPDATE assignees
 +
allpatent-# SET address = NULL
 +
allpatent-# WHERE address = 'null';
 +
UPDATE 3818842
 +
allpatent=#
 +
allpatent=# UPDATE assignees
 +
allpatent-# SET postcode = NULL
 +
allpatent-# WHERE  postcode = 'null';
 +
UPDATE 3818842
 +
allpatent=#
 +
allpatent=# UPDATE assignees
 +
allpatent-# SET patentcountry = NULL
 +
allpatent-# WHERE patentcountry = 'null';
 +
UPDATE 3818842
 +
allpatent=#
 +
allpatent=# UPDATE assignees
 +
allpatent-# SET nationality2 = NULL
 +
allpatent-# WHERE nationality2 = 'null';
 +
UPDATE 1607714
 +
allpatent=#
 +
allpatent=# UPDATE assignees
 +
allpatent-# SET residence = NULL
 +
allpatent-# WHERE residence = 'null';
 +
UPDATE 1607714
 +
allpatent=#
 +
allpatent=# UPDATE assignees
 +
allpatent-# SET asgseq = NULL
 +
allpatent-# WHERE asgseq= -1;
 +
UPDATE 1607714
 +
allpatent=#
 +
allpatent=# UPDATE assignees
 +
allpatent-# SET asgtype = NULL
 +
allpatent-# WHERE asgtype = -1;
 +
UPDATE 1607714
 +
allpatent=#
 +
 +
 +
CREATE DATABASE allpatent_clone WITH TEMPLATE allpatent OWNER dbuser;
 +
 +
== Renaming Tables and Columns ==
 +
 +
To standardize table and column names, and to make them as user-friendly as possible, a few tables and columns have been renamed.
 +
* '''allpatent''' database -> '''patent'''
 +
* assignees -> assignee
 +
* judges -> judge
 +
* citations -> citation
 +
* matchassignees -> MatchOrgNames
 +
* patents -> patent
 +
* assignees -> ptoassignee
 +
* assignments -> ptoassignment
 +
* assignors -> ptoassignor
 +
* patentassignment -> ptopatentfile
 +
* properties -> ptoproperty
 +
* mslfee -> feestatus
 +
* patentmaintenancefee -> fee

Latest revision as of 12:41, 21 September 2020


Project
Patent Data Cleanup (June 2016)
Project logo 02.png
Project Information
Has title Patent Data Cleanup - June 2016
Has owner Marcela Interiano
Has start date
Has deadline date
Has keywords Data
Has project status Subsume
Subsumed by: Patent Assignment Data Restructure
Has sponsor McNair Center
Has project output Data
Copyright © 2019 edegan.com. All Rights Reserved.

About this Page

This page contains the script that was used to clean up the patents and assignees tables in allpatent.

Cleaning up includes:

  • Cleaning 'NULL' string and -1 inserts : at the time of merging the patentdata and patent_2015 databases, I inserted 'NULL' strings and -1 in integer columns to differentiate between NULLs that came from the vendor, and 'NULL's that I inserted because of no column overlap.
    • The 'NULL's got replaced with NULL
    • The -1s got replaced with NULL as well.
  • Merging some more columns, and dropping unnecessary columns:
    • At the time of merging the tables, some columns, particularly in the patent table, were not merged as they should have been.
    • The script that follows merges those columns as well.
NOTE: The patent data page detailing the SQL steps followed to merge the data now has the updated table structures. The script on this page can be used as a reference when trying to debug any (unlikely) merging errors
  • Renaming tables and columns
    • Table names and column names have been standardized.
    • General rule of thumb is : short column names, singular table names (for example : patent and not patents)

Script

ALTER TABLE patents
RENAME COLUMN patentnumber TO patent;
ALTER TABLE patents
 DROP COLUMN kind,
 DROP COLUMN title,
 DROP COLUMN ussubclass, **
 DROP COLUMN maingroup, --
 DROP COLUMN subgroup,--
 DROP COLUMN cpcsubclass, ++
 DROP COLUMN cpcmaingroup, ++
 DROP COLUMN classificationnationalcountry,
 DROP COLUMN classificationnationalclass,** (?)
 DROP COLUMN primaryexaminerfirstname,
 DROP COLUMN primaryexaminerlastname,
 DROP COLUMN primaryexaminerdepartment,
 DROP COLUMN filename;
UPDATE patents
 SET type = '2015'
 WHERE type != 'NULL';
-- RESULT : UPDATE 1646225
UPDATE patents
 SET type = '2010'
 WHERE type = 'NULL';
-- RESULT : UPDATE 3764926
/* Join the historical patent data from the US PTO with the patents table */
ALTER TABLE PATENTS
 ADD COLUMN nber INT,
 ADD COLUMN uspc varchar,
 ADD COLUMN uspc_sub varchar;
UPDATE patents p
 SET nber = hp.nber,
 uspc = hp.uspc,
 uspc_sub = hp.uspc
  FROM historicalpatentdata hp
  WHERE hp.patentnumber = CAST(p.patent AS varchar);
-- RESULT : UPDATE 5113655
/* Mergeing some columns - claims and number of claims - column name : claims*/
UPDATE patents
 SET claims = numberofclaims
 WHERE claims = -1;
-- RESULTS : UPDATE 1646225
/* Merging columns - 
UPDATE patents
 SET appnum = CAST (applicationnumber AS INT)
 where appnum = -1;
-- RESULT : UPDATE 1646225
UPDATE patents
 SET appdate = filingdate
 where appdate = '0001-01-01 BC'
 OR filingdate is not NULL;
-- RESULT UPDATE 1646225
ALTER TABLE patents
 DROP COLUMN apptype;
/* Generating GYear and AppYear from the dates */
UPDATE patents
 SET gyear = EXTRACT(year from grantdate)
 WHERE gyear = -1
 AND grantdate IS NOT NULL;
UPDATE 1646225
UPDATE patents
 SET appyear = EXTRACT(year from appdate)
 WHERE appyear = -1 
 AND appdate is not null;
-- RESULT UPDATE 1646225
/* Test Script */
SELECT patentnumber, ussubclass, maingroup, subgroup, cpcsubclass, cpcmaingroup, cpcsubgroup, classificationnationalcountry, classificationnationalclass FROM Patents LIMIT 100;
patent               | integer           | not null
 grantdate            | date              |
 type                 | character varying |
 applicationnumber    | character varying |
 filingdate           | date              |
 prioritydate         | date              |
 prioritycountry      | character varying |
 prioritypatentnumber | character varying |
 cpcsubgroup          | character varying |
 numberofclaims       | integer           |
 pctpatentnumber      | character varying |
 claims               | integer           |
 appnum               | integer           |
 gyear                | integer           |
 appdate              | date              |
 appyear              | integer           |
 nber                 | integer           |
 uspc                 | character varying |
 uspc_sub             | character varying 
/* Drop the merged columns */
ALTER TABLE patents
 DROP COLUMN numberofclaims,
 DROP COLUMN filingdate,
 DROP COLUMN applicationnumber,
 DROP COLUMN type;


UPDATE patents
 SET prioritycountry = NULL
 WHERE prioritycountry = 'NULL';
UPDATE patents
 SET pctpatentnumber = NULL
 WHERE pctpatentnumber = 'NULL';
UPDATE patents
 SET prioritypatentnumber = NULL
 WHERE prioritypatentnumber = 'NULL';
UPDATE patents
 SET cpcsubgroup = NULL
 WHERE cpcsubgroup = 'NULL';
UPDATE patents
 SET appnum = NULL
 WHERE appnum = -1;
UPDATE patents
 SET gyear = NULL
 WHERE gyear = -1;
UPDATE patents
 SET appyear = NULL
 WHERE appyear = -1;
Results:
allpatent=# ALTER TABLE patents
allpatent-# DROP COLUMN numberofclaims,
allpatent-# DROP COLUMN filingdate,
allpatent-# DROP COLUMN applicationnumber,
allpatent-# DROP COLUMN type;
ALTER TABLE
allpatent=#
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET prioritycountry = NULL
allpatent-# WHERE prioritycountry = 'NULL';
^[[BUPDATE 3764926
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET pctpatentnumber = NULL
allpatent-# WHERE pctpatentnumber = 'NULL';
UPDATE 3764926
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET prioritypatentnumber = NULL
allpatent-# WHERE prioritypatentnumber = 'NULL';
UPDATE 3764926
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET cpcsubgroup = NULL
allpatent-# WHERE cpcsubgroup = 'NULL';
UPDATE 3764926
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET appnum = NULL
allpatent-# WHERE appnum = -1;
UPDATE 0
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET gyear = NULL
allpatent-# WHERE gyear = -1;
UPDATE 0
allpatent=#
allpatent=# UPDATE patents
allpatent-# SET appyear = NULL
allpatent-# WHERE appyear = -1;
UPDATE 0
allpatent=#


UPDATE assignees
 SET lastname = NULL
 WHERE lastname = 'null';
UPDATE assignees
 SET firstname = NULL
 WHERE firstname = 'null';
UPDATE assignees
 SET address = NULL
 WHERE address = 'null';
UPDATE assignees
 SET postcode = NULL
 WHERE   postcode = 'null';
UPDATE assignees
 SET patentcountry = NULL
 WHERE patentcountry = 'null';
UPDATE assignees
 SET nationality2 = NULL
 WHERE nationality2 = 'null';
UPDATE assignees
 SET residence = NULL
 WHERE residence = 'null';
UPDATE assignees
 SET asgseq = NULL
 WHERE asgseq= -1;
UPDATE assignees
 SET asgtype = NULL
 WHERE asgtype = -1;
RESULTS:
UPDATE assignees
allpatent-# SET lastname = NULL
allpatent-# WHERE lastname = 'null';
UPDATE 3818842
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET firstname = NULL
allpatent-# WHERE firstname = 'null';
UPDATE 3818842
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET address = NULL
allpatent-# WHERE address = 'null';
UPDATE 3818842
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET postcode = NULL
allpatent-# WHERE   postcode = 'null';
UPDATE 3818842
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET patentcountry = NULL
allpatent-# WHERE patentcountry = 'null';
UPDATE 3818842
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET nationality2 = NULL
allpatent-# WHERE nationality2 = 'null';
UPDATE 1607714
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET residence = NULL
allpatent-# WHERE residence = 'null';
UPDATE 1607714
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET asgseq = NULL
allpatent-# WHERE asgseq= -1;
UPDATE 1607714
allpatent=#
allpatent=# UPDATE assignees
allpatent-# SET asgtype = NULL
allpatent-# WHERE asgtype = -1;
UPDATE 1607714
allpatent=#


CREATE DATABASE allpatent_clone WITH TEMPLATE allpatent OWNER dbuser;

Renaming Tables and Columns

To standardize table and column names, and to make them as user-friendly as possible, a few tables and columns have been renamed.

  • allpatent database -> patent
  • assignees -> assignee
  • judges -> judge
  • citations -> citation
  • matchassignees -> MatchOrgNames
  • patents -> patent
  • assignees -> ptoassignee
  • assignments -> ptoassignment
  • assignors -> ptoassignor
  • patentassignment -> ptopatentfile
  • properties -> ptoproperty
  • mslfee -> feestatus
  • patentmaintenancefee -> fee