In my last post I wrote about how I was able to improve protobuf deserialization using a C++ extension. I’ve boiled down the code to its essence to show how I did it. Rather than zipping everything up in a file, I’ll show the code in its entirety, since it is short enough.

Here’s the simplified protobuf message which is used to represent a time series as 2 arrays:

[sourcecode language="text"]

// Package name; the generated C++ code lives in the `timeseries` namespace.
package timeseries;

// A time series represented as two arrays: one of times and one of values.
message TimeSeries {

// Sample times (units are not specified here).
repeated double times = 2;

// Sample values; the consumers of this message treat times and values as
// arrays of the same length.
repeated double values = 3;

}

[/sourcecode]

I then wrote a test app in Python and C++ to provide a benchmark. Here is the Python version:

[sourcecode language="python"]

import numpy

import time_series_pb2

def write_test():
    """Build a 10-million-point TimeSeries, write it to ts.bin and print the write time.

    Only serialization plus the file write is timed; building the message in
    memory is deliberately excluded from the measurement.
    """
    import time

    ts = time_series_pb2.TimeSeries()
    for i in range(10000000):
        ts.times.append(i)
        ts.values.append(i*10.0)

    start = time.time()
    # `with` guarantees the file is closed even if serialization or the write fails.
    with open("ts.bin", "wb") as f:
        f.write(ts.SerializeToString())
    # Single-argument print(...) behaves the same under Python 2 and Python 3.
    print(time.time() - start)

def read_test():
    """Read ts.bin, parse it as a TimeSeries, convert both fields to numpy arrays.

    Prints the elapsed time for read + parse + conversion, followed by the
    number of points read.
    """
    import time

    ts = time_series_pb2.TimeSeries()

    start = time.time()
    # `with` guarantees the file is closed even if parsing fails.
    with open("ts.bin", "rb") as f:
        ts.ParseFromString(f.read())

    # The conversion is part of the timed work; t and v are otherwise unused.
    # NOTE(review): `_values` is a private attribute of the repeated-field
    # container -- presumably only present in the pure-Python protobuf
    # implementation; confirm before upgrading protobuf.
    t = numpy.array(ts.times._values)
    v = numpy.array(ts.values._values)

    # %-formatting keeps the output identical to the old two-argument print
    # statement while remaining valid on both Python 2 and Python 3.
    print("Read time: %s" % (time.time() - start))
    print("Read time series of length %d" % len(ts.times))

if __name__ == "__main__":
    import sys

    # Without an argument there is nothing to dispatch on: show the usage and
    # stop, instead of falling through to sys.argv[1] and raising IndexError.
    if len(sys.argv) < 2:
        print("usage: %s <--read> | <--write>" % sys.argv[0])
        sys.exit(1)

    # "--read" runs the read benchmark; any other argument runs the write one.
    if sys.argv[1] == "--read":
        read_test()
    else:
        write_test()

[/sourcecode]

I will spare you the C++ standalone code, since it was only a stepping stone. Instead here is the C++ extension, with 2 exposed methods, one which deserializes a string and the other which operates on a file.

[sourcecode language="cpp"]

// Python.h must come before any standard header (it defines feature macros).
#include <Python.h>

#include <fcntl.h>
#include <unistd.h>

#include <cstring>

#include <numpy/arrayobject.h>

#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <google/protobuf/io/zero_copy_stream_impl_lite.h>

#include "time_series.pb.h"

// Copies one repeated double field into a newly created 1-D numpy array of
// doubles. Returns a new reference, or NULL with a Python exception set.
static PyObject* copy_to_numpy_array(const google::protobuf::RepeatedField<double>& field)
{
    npy_intp dims[1] = {static_cast<npy_intp>(field.size())};

    // Let numpy allocate the buffer so that the array owns it and releases it
    // when the array is garbage collected. Wrapping a new[]-allocated buffer
    // with PyArray_SimpleNewFromData would leak it: numpy never frees memory
    // it was merely handed.
    PyObject* array = PyArray_SimpleNew(1, dims, NPY_DOUBLE);
    if (array == NULL)
    {
        return NULL;
    }

    // The data must be copied because the message (and its storage) goes away
    // once the caller returns. Skip the copy for an empty field so memcpy is
    // never handed a possibly-null source pointer.
    if (field.size() > 0)
    {
        std::memcpy(PyArray_DATA(reinterpret_cast<PyArrayObject*>(array)),
                    field.data(),
                    static_cast<size_t>(field.size()) * sizeof(double));
    }
    return array;
}

// Builds the Python return value for a parsed TimeSeries.
//
// Returns a new reference to a tuple (t, v) of two 1-D numpy double arrays
// holding copies of ts->times() and ts->values(), or NULL with a Python
// exception set if an allocation fails.
//
// t and v normally have the same length. Each array is sized from its own
// field, so a message whose two fields differ in length can neither overflow
// a buffer nor expose uninitialised memory.
static PyObject* construct_numpy_arrays(timeseries::TimeSeries* ts)
{
    PyObject* time_array = copy_to_numpy_array(ts->times());
    if (time_array == NULL)
    {
        return NULL;
    }

    PyObject* value_array = copy_to_numpy_array(ts->values());
    if (value_array == NULL)
    {
        Py_DECREF(time_array);
        return NULL;
    }

    PyObject* data_tuple = PyTuple_New(2);
    if (data_tuple == NULL)
    {
        Py_DECREF(time_array);
        Py_DECREF(value_array);
        return NULL;
    }

    // PyTuple_SET_ITEM steals the references, so the tuple now owns both arrays.
    PyTuple_SET_ITEM(data_tuple, 0, time_array);
    PyTuple_SET_ITEM(data_tuple, 1, value_array);
    return data_tuple;
}

// timeseries.load(filename) -> (t, v)
//
// Parses the file `filename` as a serialized timeseries::TimeSeries message
// and returns its contents as a tuple of two numpy arrays.
//
// Raises IOError if the file cannot be opened and ValueError if its contents
// cannot be parsed as a TimeSeries.
static PyObject* TimeSeries_load(PyObject* self, PyObject* args)
{
    char* filename = NULL;
    if (!PyArg_ParseTuple(args, "s", &filename))
    {
        return NULL;
    }

    int fd = open(filename, O_RDONLY);
    if (fd < 0)
    {
        // Sets IOError from errno (with the filename attached) and returns NULL.
        return PyErr_SetFromErrnoWithFilename(PyExc_IOError, filename);
    }

    timeseries::TimeSeries ts;
    bool parsed = false;
    {
        // The stream owns the descriptor and closes it exactly once when it
        // goes out of scope. The original closed it twice: fs.Close() already
        // closes the descriptor, and close(fd) was then called on top.
        google::protobuf::io::FileInputStream fs(fd);
        fs.SetCloseOnDelete(true);

        // Declared after fs so it is destroyed first, while fs is still open.
        google::protobuf::io::CodedInputStream coded_fs(&fs);
        // Raise the total-bytes limit to 500 MiB so large series can be read.
        coded_fs.SetTotalBytesLimit(500*1024*1024, -1);

        parsed = ts.ParseFromCodedStream(&coded_fs);
    }

    if (!parsed)
    {
        PyErr_Format(PyExc_ValueError,
                     "could not parse a TimeSeries message from '%.200s'",
                     filename);
        return NULL;
    }

    return construct_numpy_arrays(&ts);
}

// timeseries.deserialize(data) -> (t, v)
//
// Parses `data` (a string / read-only character buffer holding a serialized
// timeseries::TimeSeries message) and returns its contents as a tuple of two
// numpy arrays.
//
// Raises ValueError if the buffer cannot be parsed as a TimeSeries.
static PyObject* TimeSeries_deserialize(PyObject* self, PyObject* args)
{
    // "t#" stores the length as an int (PY_SSIZE_T_CLEAN is not defined).
    int buffer_length = 0;
    char* serialization = NULL;
    if (!PyArg_ParseTuple(args, "t#", &serialization, &buffer_length))
    {
        return NULL;
    }

    timeseries::TimeSeries ts;
    bool parsed = false;
    {
        google::protobuf::io::ArrayInputStream input(serialization, buffer_length);
        google::protobuf::io::CodedInputStream coded_input(&input);
        // Raise the total-bytes limit to 500 MiB so large series can be read.
        coded_input.SetTotalBytesLimit(500*1024*1024, -1);

        parsed = ts.ParseFromCodedStream(&coded_input);
    }

    // Report malformed input instead of silently returning partial data.
    if (!parsed)
    {
        PyErr_SetString(PyExc_ValueError,
                        "could not parse a TimeSeries message from the given string");
        return NULL;
    }

    return construct_numpy_arrays(&ts);
}

// Method table for the `timeseries` module: Python-visible name, C
// implementation, calling convention and docstring for each function.
static PyMethodDef TSMethods[] = {
    {"load", TimeSeries_load, METH_VARARGS, "loads a TimeSeries from a file"},
    {"deserialize", TimeSeries_deserialize, METH_VARARGS, "loads a TimeSeries from a string"},
    // Sentinel: the interpreter walks the table until it finds a NULL name,
    // so without this entry it would read past the end of the array.
    {NULL, NULL, 0, NULL}
};

#ifndef PyMODINIT_FUNC /* declarations for DLL import/export */

#define PyMODINIT_FUNC void

#endif

PyMODINIT_FUNC inittimeseries(void)

{

import_array();

(void) Py_InitModule("timeseries", TSMethods);

}

[/sourcecode]

Calling the extension from Python is trivial:

[sourcecode language="python"]

# Load ts.bin through the C++ extension and report how long the read plus
# conversion to numpy arrays took.
import time

import timeseries

start = time.time()
# load() returns a tuple (t, v) of two numpy arrays: times and values.
t, v = timeseries.load('ts.bin')

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print("read and converted to numpy array in %f" % (time.time()-start))
print("timeseries contained %d values" % len(v))

[/sourcecode]