I have a very large (20 GB) CSV file.
I have been asked to parse the file in the most efficient manner possible so that it gives the best performance.

Based on the first column in the CSV, I have to invoke an async REST API URL; I don't need anything from the responses.

Can you please suggest how I should approach this?


Please find below a sample of the code I tried.

First method:
I split the huge file into small chunks using a shell script and then called the Python program for each chunk, passing the chunk's file name as an argument.


Shell code:

#!/bin/bash
# Split the big log into 10000-line chunks: outputfile.logaa, outputfile.logab, ...
split -l 10000 LogServer.log outputfile.log

for f in $(find '/home/' -name 'outputfile.*'); do
    echo "$f"
    var=$(echo "$f" | awk -F"/" '{print $4}')   # strip the directory part of the path
    echo "the file name is $var"
    python test.py "$var" &                     # process each chunk in the background
done


import sys, csv, datetime, urllib2

# Map each event type to the REST endpoint it should hit.
EventAPIDict = {"event": "http://ap/pgr/?",
                "character_logout": "http://ap/pgr/pgr/?"}

# Query-string parameter names for each event type.
EventParamsDict = {"event": ("s", "ts"),
                   "character_logout": ("s", "ts")}

# CSV column index that supplies each parameter, in the same order.
EventIndexDict = {"event": ("8", "4"),
                  "character_logout": ("8", "4")}


def convertFile(fileName):
    with open(fileName, 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            event = row[0]
            if event in EventAPIDict:
                strRESTAPI = EventAPIDict[event]
                indexes = EventIndexDict[event]
                for i, param in enumerate(EventParamsDict[event]):
                    value = row[int(indexes[i])]
                    if param == "ts":
                        # e.g. "2013-05-28 14:28:58" -> Unix timestamp
                        value = datetime.datetime.strptime(
                            value, '%Y-%m-%d %H:%M:%S').strftime("%s")
                    strRESTAPI = "%s%s=%s&" % (strRESTAPI, param, value)
                strRESTAPI = strRESTAPI[:-1]   # drop the trailing '&'
                urllib2.urlopen(strRESTAPI)    # fire the request; response is ignored


if __name__ == '__main__':
    convertFile(sys.argv[1])
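For what it is worth, I also sketched a variant of this script where the blocking urlopen calls are overlapped with a small thread pool. This is only a rough sketch, assuming Python 2 and that the API can handle concurrent requests; NUM_THREADS is a made-up number and build_url is a placeholder that stands in for the EventAPIDict / EventParamsDict loop above (the timestamp conversion is omitted here).

import csv, sys, threading, urllib2
from Queue import Queue

NUM_THREADS = 8        # made-up number; needs tuning

def build_url(row):
    # Hypothetical stand-in for the EventAPIDict / EventParamsDict loop above;
    # the real code also converts the timestamp in column 4.
    return "http://ap/pgr/?s=%s&ts=%s" % (row[8], row[4])

def worker(q):
    while True:
        url = q.get()
        if url is None:                    # sentinel: no more rows
            break
        try:
            urllib2.urlopen(url).close()   # fire the request, ignore the body
        except urllib2.URLError:
            pass                           # skip failures for now

def convert_file(filename):
    q = Queue(maxsize=1000)                # bounded so the reader cannot outrun the workers
    threads = [threading.Thread(target=worker, args=(q,)) for _ in range(NUM_THREADS)]
    for t in threads:
        t.start()
    with open(filename, 'rb') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if row and row[0] in ("event", "character_logout"):
                q.put(build_url(row))
    for _ in threads:
        q.put(None)                        # one sentinel per worker
    for t in threads:
        t.join()

if __name__ == '__main__':
    convert_file(sys.argv[1])

The idea is that the per-row work is tiny compared to the network round trip, so keeping a handful of requests in flight should hide most of the latency; I have not measured it against the split-and-fork approach yet.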





Second method:
I used a map-reduce architecture with mincemeat.py.
#!/usr/bin/env python
# Start the workers separately with:  mincemeat.py -p bulk 127.0.0.1
import csv
import mincemeat


# Map function: each datasource value is [event_name, row_as_string];
# yielding it groups the row strings by event name.
def map_bigdata(k, v):
    yield v


# Reduce function: rebuild each row, build the REST URL and call it.
def reduce_bigdata(k, ones):
    import urllib2

    # Event names below are placeholders for the real ones.
    EventAPIDict = {"event_name": "http://url/pgr/?"}
    EventParamsDict = {"event_name": ("s", "ts")}
    EventIndexDict = {"event_name": ("8", "4")}

    for dummy in ones:
        row = dummy.split(',')
        event = row[0]
        if event in EventAPIDict:
            strRESTAPI = EventAPIDict[event]
            indexes = EventIndexDict[event]
            i = 0
            for param in EventParamsDict[event]:
                value = row[int(indexes[i])]
                if param == "ts":
                    # timestamp conversion stubbed out with a fixed value for testing
                    value = '1371644793'
                strRESTAPI = "%s%s=%s&" % (strRESTAPI, param, value)
                i = i + 1
            strRESTAPI = strRESTAPI[:-1]   # drop the trailing '&'
            urllib2.urlopen(strRESTAPI)    # response is ignored
    return 1


# Build the datasource: key = row index, value = [event name, full row as a string].
allrows = csv.reader(open('LogServer.log', 'rb'), delimiter=',')
source_data = {}
for index, row in enumerate(allrows):
    source_data[index] = [row[0], ','.join(row)]

print "Initial done"

s = mincemeat.Server()
s.datasource = source_data
s.mapfn = map_bigdata
s.reducefn = reduce_bigdata

results = s.run_server(password="bulk")




But for a 16 MB file it already takes around 10 minutes. Can someone please help me?
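In case it helps to clarify what I am after, this is the kind of single-pass approach I am considering as a third option: stream the big file once and hand batches of rows to a multiprocessing.Pool whose workers fire the requests. Again only a sketch, assuming Python 2, independent rows, and the same column indexes (8 and 4) as in the first method; the timestamp conversion is left out and the worker/batch sizes are made up.

import csv, itertools, urllib2
from multiprocessing import Pool

EventAPIDict = {"event": "http://ap/pgr/?",
                "character_logout": "http://ap/pgr/pgr/?"}

def send_batch(rows):
    # Each worker process handles one batch of parsed CSV rows.
    sent = 0
    for row in rows:
        event = row[0]
        if event in EventAPIDict:
            # ts is left as the raw column value here; the real code converts it.
            url = "%ss=%s&ts=%s" % (EventAPIDict[event], row[8], row[4])
            try:
                urllib2.urlopen(url).close()   # response body is not needed
            except urllib2.URLError:
                pass
            sent += 1
    return sent

def batches(reader, size=5000):
    # Yield lists of rows so each task carries a meaningful amount of work.
    while True:
        batch = list(itertools.islice(reader, size))
        if not batch:
            break
        yield batch

if __name__ == '__main__':
    pool = Pool(processes=8)               # made-up worker count; needs tuning
    with open('LogServer.log', 'rb') as f:
        reader = csv.reader(f, delimiter=',')
        total = sum(pool.imap_unordered(send_batch, batches(reader)))
    pool.close()
    pool.join()
    print "dispatched", total, "requests"

Batching is meant to keep the inter-process overhead per request low; whether 8 processes and batches of 5000 rows are the right numbers is something I would still have to measure.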