#!/usr/bin/env python """ Author : flo date : 2015-12-31 purpose : download this zip file from cms.gov in code then extract one particular text file : from this ICD 10 package and process the text file into SQL statements : to then be inserted into a mysql DB container. download : 2016-Code-Descriptions-in-Tabular-Order.zip process : icd10cm_order_2016 run : python icd10.py comments : i am using pycharm editor community edition, there are some yellow underline warning : which i have no clue as to why it is underline. : missing 22 records due to some initial extra spacing from the extracted text file. """ from __future__ import print_function import base64 import sys import os import re import StringIO import zipfile try: import mysql.connector from mysql.connector import errorcode except ImportError as err: print(err) print("Missing Mysql Connector plugin.") print("Please install Mysql Connector module.") exit(1) try: import requests except ImportError as err: print(err) print("Missing requests plugin: Please Install requests module") exit(1) def printauthor(): author = "YXNjaGVuYmFjaEBnbWFpbC5jb20=" print(base64.b64decode(author)) def createicd10schema(): DB_NAME = 'icd10' icd10Tables = {} icd10Tables['icd10Code'] = ( "CREATE TABLE IF NOT EXISTS `icd10Code` (" " `ID` INT(11) NOT NULL AUTO_INCREMENT," " `icd10OrderID` VARCHAR(10) NULL DEFAULT NULL," " `icd10Codes` VARCHAR(10) NULL DEFAULT NULL," " `icd10HeaderCode` VARCHAR(1) NULL DEFAULT NULL," " `icd10ShortDesc` VARCHAR(250) NULL DEFAULT NULL," " `icd10LongDesc` VARCHAR(250) NULL DEFAULT NULL," " PRIMARY KEY (`ID`)" ") ENGINE=InnoDB") config = {'user': 'USERNAME', 'password': 'YOURPASSWORD', 'host': 'MYSQLServerHostName', 'database': 'mysql', 'raise_on_warnings': True} conn = mysql.connector.connect(**config) cur = conn.cursor(buffered=True) try: cur.execute("CREATE DATABASE {} DEFAULT CHARACTER SET 'latin1'".format(DB_NAME)) except mysql.connector.Error as err: print("Failed Creating Database: {}".format(err)) exit(1) try: conn.database = DB_NAME except mysql.connector.Error as err: if err.errno == errorcode.ER_BAD_DB_ERROR: createicd10schema() conn.database = DB_NAME else: print(err) exit(1) for name, ddl in icd10Tables.iteritems(): try: print("Creating table {}: ".format(name), end='') cur.execute(ddl) except mysql.connector.Error as err: if err.errno == errorcode.ER_TABLE_EXISTS_ERROR: print("Already Exists.") else: print(err.msg) print("Tables and Data inserted.") conn.commit() cur.close() conn.close() def getdata(): sess = requests.Session() url = 'https://www.cms.gov/Medicare/Coding/ICD10/Downloads/2016-Code-Descriptions-in-Tabular-Order.zip' data = sess.get(url, verify=True) if data.ok: # DUMP Internet Data to local fs if sys.platform == "win32": z = zipfile.ZipFile(StringIO.StringIO(data.content)) z.extractall() z.close() else: z = zipfile.ZipFile(StringIO.StringIO(data.content)) z.extractall() z.close() else: print("This ICD10 CM code file doesnt seem to be intact during this download process. " + url) exit(1) sess.close() def parsedata(): icdfile = 'icd10cm_order_2016.txt' f = open(icdfile, 'r') data = [] icd10list = [] icd10code = [] icd10final = [] result = [] sql, tmp1 = "", "" count = 1 # processing atoms from a pseudo-randomized state for i in f: data.append(i) # adding additional atoms into atoms for line in data: line = re.sub(r"\'", "\\'", line) line = re.sub(r"\"", "\\\"", line) icd10list.append(line) # Splitting Atoms for a in icd10list: line = a[:5] + " " + a[6:14] + " " + a[14:15] + " " + a[16:77] + " " + a[77:] icd10final.append(line) # Splitting smaller Atoms for a in icd10final: cl = re.findall(r"\S+(?:\s\S+)*", a) icd10code.append(cl) # combining Atoms into one giant one for b in icd10code: if len(b) == 5: for c in b: tmp1 = tmp1 + '\'' + c + '\'' + ',' tmp1 = tmp1.rstrip(',') sql = 'INSERT INTO `icd10Code` VALUES ( ' + str(count) + ',' + tmp1 + ');' + '\n' result.append(sql) # very important be needing this line below tmp1, c, sql = "", "", "" count += 1 # There are actually 22 rows of data where this initial text files had # additional spacing where i wasnt aware until checking the final # nnumber with the starting number of rows. Sadly, it doesnt match. # no solution on how to take care of these additional 22 rows of data # at the moment, but will try to fix it soon. # there are almost 92000 rows of data from icd10cm_order_2016.txt # additional spacings were in this file after extraction. # would need to combine some items in the two short desc and two long # desc. return result def injectdatatodb(data): config = {'user': 'USERNAME', 'password': 'YOURPASSWORD', 'host': 'MYSQLServerHostName', 'database': 'icd10', 'raise_on_warnings': True} conn = mysql.connector.connect(**config) cur = conn.cursor(buffered=True) for sql in data: cur.execute(sql) conn.commit() cur.close() conn.close() return data def writetofile(icddata): icdfile = "icd10SQLData.sql" if sys.platform == "win32": path = os.getcwd() path = path + "\\" + icdfile f = open(path, 'a') for i in icddata: f.write(i) else: path = os.getcwd() path = path + "/" + icdfile f = open(path, 'a') for i in icddata: f.write(i) f.close() def main(): createicd10schema() print() print("Downloading data file from hhs.gov.") getdata() print("Done Downloading Data file from hhs.gov.") print() print("Parsing Data file to escape single and double quotes.") b = parsedata() print("Done parsing data.") print() print("Inserting Data into Database. ") c = injectdatatodb(b) print("Done inserting Data to Database.") print() print("Saving icd10cm_order_2016.txt and icd10SQLData.sql to your computer.") writetofile(c) print("Done Saving files to your computer.") printauthor() exit(0) main()