I had to retrieve individual records by dn from a 500MB LDIF file, and parsing it with the traditional Python LDIF libraries (LDIFParser) took around 90 seconds. The issue is that this is a 'sparse' lookup problem: I didn't really need 99% of the records, just around 100-1000 records scattered anywhere in the LDIF file.
I figured it would be preferable to index the entire LDIF as quickly as possible and then use a getDN function to retrieve entries individually. Of course, you can get pretty clever with this and extend it to do more interesting indexing.
Stats of the LDIF file I used:
Size: 448,355kB
Lines: 16,337,077
Entries: 389,776
And here is how the indexing approach performed:
Time to index: 14,763ms
Time to retrieve 1000 entries using getDN: 1,855ms
Total memory: ~45MB
Using the LDIFParser and storing dns in a map for easy retrieval:
Time to index: 93,628ms
Time to retrieve 1000 entries using the map: 1,432ms
Total memory: ~1.5GB
import base64
import sys
import StringIO
from collections import OrderedDict

import ldif    # python-ldap's ldif module


class largeLDIF(ldif.LDIFParser):
    """indexes through large ldifs to only store dn vs file.seek offset as a map
    retrieval is done on demand afterwards"""
    dnMap=OrderedDict()
    count=0

    def __init__(self, input_file):
        if isinstance(input_file, str):
            self.ldif_if = open(input_file, 'rb')
        elif hasattr(input_file, "read"):
            self.ldif_if = input_file
        ldif.LDIFParser.__init__(self, self.ldif_if)
        self.dnMap = {}

    def parse(self):
        """override parse to only look for dns to generate a map"""
        loc = self.ldif_if.tell()
        self._line = self.ldif_if.readline()
        while self._line:
            colon_pos = self._line.find(':')
            if self._line[0] == '#' or colon_pos < 0:
                loc = self.ldif_if.tell()
                self._line = self.ldif_if.readline()
                continue
            attr_type = self._line[0:colon_pos]
            if attr_type == 'dn':
                # check if line is folded, do this only if a dn was found
                unfolded_lines = [self._stripLineSep(self._line)]
                self._line = self.ldif_if.readline()
                # check for fold and don't process if not needed
                if self._line and self._line[0] == ' ':
                    while self._line and self._line[0] == ' ':
                        unfolded_lines.append(self._stripLineSep(self._line))
                        self._line = self.ldif_if.readline()
                    unfoldeddn = ''.join(unfolded_lines)
                else:
                    unfoldeddn = unfolded_lines[0]
                value_spec = unfoldeddn[colon_pos:colon_pos + 2]
                if value_spec == '::':
                    # attribute value needs base64-decoding
                    attr_value = base64.decodestring(unfoldeddn[colon_pos + 2:])
                elif unfoldeddn[colon_pos + 1:].strip() == '':
                    # no use if dn is empty
                    continue
                else:
                    attr_value = unfoldeddn[colon_pos + 2:].lstrip()
                self.dnMap[attr_value] = loc
                self.count += 1
                if self.count % 10000 == 0:
                    sys.stderr.write("%s - %s\n" % (self.count, attr_value))
            # now read until an empty line or end of file is found
            # since that would indicate end of record
            while self._line and not (self._line == '\r\n' or self._line == '\n'):
                self._line = self.ldif_if.readline()
            loc = self.ldif_if.tell()
            self._line = self.ldif_if.readline()

    def getDN(self, dn):
        if self.dnMap.has_key(dn):
            self.ldif_if.seek(self.dnMap[dn])
            # read the entry block into a StringIO object
            ldifStr = StringIO.StringIO()
            line = self.ldif_if.readline()
            while line:
                if not line or line == '\r\n' or line == '\n':
                    ldifStr.write(line)
                    break
                else:
                    ldifStr.write(line)
                line = self.ldif_if.readline()
            ldifStr.seek(0)
            # record is read in
            rec = ldif.LDIFRecordList(ldifStr)
            rec.parse()
            return rec.all_records[0]
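For completeness, this is roughly how the class gets used; the file name and dn below are just placeholders:

idx = largeLDIF('people.ldif')                     # placeholder file name
idx.parse()                                        # builds the dn -> byte offset map
entry = idx.getDN('cn=John Doe,ou=people,dc=example,dc=com')
# getDN returns the (dn, attributes) tuple produced by LDIFRecordList
print(entry[0])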
So, I couldn't help it... I tested a simple file read to see how fast Python could get through the file without doing any work at all. Just a plain read of a 500MB file.
Well, that turned out to be around the 2-second mark.
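The baseline test amounts to something like this (the file name is a placeholder, and your numbers will vary with hardware and OS caching):

import time
start = time.time()
with open('people.ldif', 'rb') as f:   # placeholder file name
    for line in f:                     # plain read, no per-line work
        pass
print("raw read: %.1fs" % (time.time() - start))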
Next I tried adding one or two if-then-elses to see how they affected performance, and that got me to around 7 seconds.
So I rewrote the class with the fewest branches and operations I could manage, as shown below.
An important thing to note is that using a for loop over the file object improves performance significantly.
Also, to optimize checking for 'dn', I found that startswith takes far more time than a slice comparison (see this post: http://stackoverflow.com/questions/13270888/why-is-startswith-slower-than-slicing).
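A quick timeit comparison shows the gap; the sample line here is made up:

import timeit
setup = "line = 'dn: cn=John Doe,ou=people,dc=example,dc=com\\n'"
print(timeit.timeit("line.startswith('dn:')", setup=setup, number=1000000))
print(timeit.timeit("line[:3] == 'dn:'", setup=setup, number=1000000))
# the slice comparison wins largely because startswith pays for an
# attribute lookup and method call on every iteration

With that in mind, here is the rewritten class: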
class largeLDIF(ldif.LDIFParser):
    """indexes through large ldifs to only store dn vs file offset as a map
    retrieval is done on demand afterwards"""
    dnMap=OrderedDict()
    count=0
    _loc=0L
    _newloc=0L

    def __init__(self, input_file):
        if isinstance(input_file, str):
            self.ldif_if = open(input_file, 'rb')
        elif hasattr(input_file, "read"):
            self.ldif_if = input_file
        ldif.LDIFParser.__init__(self, self.ldif_if)
        self.dnMap = {}

    def parse(self):
        """override parse to only look for dns to generate a map"""
        loc = 0L
        newloc = 0L
        # prime this
        unfolded_lines = None
        unfoldcheck = False
        dnloc = 0L
        readuntilEndofRec = False
        colon_pos = 2
        for self._line in self.ldif_if:
            loc = newloc
            newloc += len(self._line)
            if readuntilEndofRec:
                # skip lines until the blank line that ends the record
                if self._line == "\r\n" or self._line == "\n":
                    readuntilEndofRec = False
                    continue
                else:
                    continue
            if unfoldcheck:
                if self._line[0] == ' ':
                    # folded line detected
                    unfolded_lines.append(self._stripLineSep(self._line[1:]))
                    continue
                else:
                    unfoldeddn = ''.join(unfolded_lines)
                    value_spec = unfoldeddn[colon_pos:colon_pos + 2]
                    if value_spec == '::':
                        # attribute value needs base64-decoding
                        attr_value = base64.decodestring(unfoldeddn[colon_pos + 2:])
                    elif unfoldeddn[colon_pos + 1:].strip() == '':
                        # no use if dn is empty
                        unfoldcheck = False
                        continue
                    else:
                        attr_value = unfoldeddn[colon_pos + 2:].lstrip()
                    self.dnMap[attr_value.lower()] = dnloc
                    if self.count % 10000 == 0:
                        sys.stderr.write("%s - %s\n" % (self.count, attr_value))
                    unfoldcheck = False
                    readuntilEndofRec = True
                    continue
            attr_spec = self._line[:3]
            if attr_spec == 'dn:':
                unfolded_lines = [self._stripLineSep(self._line)]
                unfoldcheck = True
                dnloc = loc
                self.count += 1

    def getDN(self, dn):
        if self.dnMap.has_key(dn.lower()):
            self.ldif_if.seek(self.dnMap[dn.lower()])
            # read the entry block into a StringIO object
            ldifStr = StringIO.StringIO()
            for line in self.ldif_if:
                ldifStr.write(line)
                if line == '\r\n' or line == '\n':
                    break
            ldifStr.seek(0)
            # record is read in
            rec = ldif.LDIFRecordList(ldifStr)
            rec.parse()
            return rec.all_records[0]
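Reproducing the 1000-entry retrieval timing takes nothing more than a loop along these lines (how the dns are picked is up to you; any 1000 dns known to be in the file will do):

import random, time
idx = largeLDIF('people.ldif')                     # placeholder file name
idx.parse()
sample = random.sample(idx.dnMap.keys(), 1000)     # any 1000 indexed dns
start = time.time()
for dn in sample:
    idx.getDN(dn)
print("1000 lookups: %.0fms" % ((time.time() - start) * 1000))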
Well, here are the results:
Time to index: 10,515ms
Time to retrieve 1000 entries using getDN: 342ms
Total memory: ~50MB
So we shaved nearly another 30% off the indexing time and sped up retrieval as well!
Also, it's interesting to note that I see a solid disk IO of around 50MB/s during the initial indexing step (which makes sense, as 50MB/s x 10 seconds = 500MB, so it adds up). I don't believe this is a disk IO bottleneck, though, since a raw read in Python with no checks at all is around the 2-second mark; with buffering etc. the perceived disk speed is around 250MB/s (and I don't have an SSD in this test, so this has to be some OS buffering).