#!/usr/bin/env python
"""
Author   : flo
Date     : 2017.10.17
Purpose  : Pull a random sample of records from a large dataset.
Comments : The input must be a text file with one record per line.
         : You need to know the number of records in your file first.
         : If it's a CSV text file, you can get it on Linux with
         : `cat data.csv | wc -l`.
         : Files that are not text based, or that do not hold one record
         : per line, are not supported.
Example  : python randDataSet.py maxDataSize sampleSize fullPathToDataInput fullPathToSampleOutput
Example  : python randDataSet.py 1000000 1000 /home/username/data.csv /var/tmp/output.txt
"""
import random as rand
import sys
import os


def _positive_int(text):
    """Parse *text* as a strictly positive integer or exit with an error.

    ``sys.exit(message)`` writes the message to stderr and terminates with
    exit status 1, so shell callers can detect the failure.
    """
    try:
        # isdigit() keeps the original rule (no sign, no whitespace); the
        # try/except covers digit-like characters that int() cannot parse.
        value = int(text) if text.isdigit() else None
    except ValueError:
        value = None
    if value is None:
        sys.exit("Value entered was not numeric.")
    if value <= 0:
        sys.exit("Value can not be less than or equal to zero.")
    return value


def chkSanity(tmp):
    """Validate the command-line arguments and run the sampling.

    Parameters:
        tmp: list of exactly four strings, in this order: max data size,
             sample size, input file path, output file path.

    On success ``randdataset`` is called with the parsed values.

    Raises:
        SystemExit: with an explanatory message (exit status 1) when the
        argument count is wrong, a size is not a positive integer, the
        sample size is not smaller than the max data size, the input file
        does not exist, or the output directory does not exist.
    """
    if len(tmp) < 4:
        sys.exit("Not enough Parameters.")
    if len(tmp) > 4:
        sys.exit("Too many Parameters.")

    # Max data size
    msize = _positive_int(tmp[0])

    # Sample data size
    ssize = _positive_int(tmp[1])
    if ssize >= msize:
        sys.exit("Value can not be equal or larger than the number of Max Data size.")

    # File input: the file itself must exist, not merely its directory.
    fin = tmp[2]
    if not os.path.isfile(fin):
        sys.exit("File system path does not exists.")

    # File output: only the target directory has to exist. abspath() makes a
    # bare file name such as "output.txt" resolve to the current directory
    # instead of an empty dirname.
    fout = tmp[3]
    if not os.path.isdir(os.path.dirname(os.path.abspath(fout))):
        sys.exit("File system path does not exists.")

    randdataset(msize, ssize, fin, fout)


def randdataset(m, s, fi, fo):
    """Write ``s`` randomly chosen lines of file ``fi`` to file ``fo``.

    Parameters:
        m:  declared number of records in the dataset. It is not needed for
            the sampling itself (the real line count of ``fi`` is used) and
            is kept so existing callers keep working.
        s:  number of lines to sample, without replacement.
        fi: path of the input text file, one record per line.
        fo: path of the output file; it is created or overwritten.

    Raises:
        ValueError: if ``s`` is negative or larger than the number of lines
        in ``fi``. The output file is left untouched in that case.

    The whole input file is read into memory before sampling.
    """
    # Read-only access is enough for the input; "with" closes the handle even
    # when an error occurs.
    with open(fi, "r") as source:
        records = source.readlines()

    # Check before the output file is opened so a bad request never truncates
    # an existing output file.
    if s > len(records):
        raise ValueError(
            "Sample size %d is larger than the %d records found in %s."
            % (s, len(records), fi)
        )

    # The random module is already seeded from OS entropy (or by the caller);
    # re-seeding it from its own output would add nothing.
    chosen = rand.sample(records, s)

    with open(fo, "w") as target:
        for record in chosen:
            # The last line of the input may lack a newline; add one so it is
            # not glued to the next sampled record.
            if not record.endswith("\n"):
                record += "\n"
            target.write(record)


def main():
    """Entry point: validate ``sys.argv`` and produce the sample file."""
    chkSanity(sys.argv[1:])


# main
if __name__ == "__main__":
    main()