Converting SS-DB data into NetCDF format
A challenge in designing systems for scientific data analysis is a lack of representative data sets and queries. In the world of relational database systems, the TPC benchmarks serve as a common tool for comparing performance. However, there has been little work done in producing benchmarks representative of scientific data analysis workloads. One such solution is the SS-DB benchmark. From the Science Benchmark (SS-DB) website:
SS-DB is representative of the processing performed in a number of scientific domains in addition to astronomy, including earth science, oceanography, and medical image analysis.
The SS-DB website links to the data generator tool used to produce SS-DB data sets, which are generated in a raw binary format. Systems such as SciHadoop are designed to process NetCDF data using Hadoop. The following tool converts a raw SS-DB data set to a NetCDF file that can be used in existing tools, such as SciHadoop or the NetCDF Operator Suite.
Building the tool requires the NetCDF development libraries. Example build and usage:
gcc -Wall -o tool ssdb_nc3_loader.c -lnetcdf
usage: ./tool [-c] -i <img path> -z <z-dim idx> -s <img size> -n <out.nc>
Here is the code for the ssdb_nc3_loader.c tool:
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <netcdf.h>
/*
 * Number of integer attributes stored for each pixel of an image; this
 * also sizes the attrs[] array of struct pixel.
 */
#define PIXEL_ATTRS 11
/*
 * One image pixel as read from the input image file: PIXEL_ATTRS ints.
 * load_image() maps the image file as an array of these, and uses
 * sizeof(struct pixel) when checking the file size against the expected
 * side length.
 */
struct pixel {
int attrs[PIXEL_ATTRS];
};
/*
 * Terminate the program if a netCDF call reported an error.
 *
 * stat - status code returned by a netCDF library call
 * line - source line of the call site (pass __LINE__)
 * file - source file of the call site (pass __FILE__)
 *
 * Returns normally only when stat is NC_NOERR; otherwise prints the call
 * site together with the library's error text to stderr and exits with
 * status 1.
 */
static void
check_err(const int stat, const int line, const char *file) {
    if (stat == NC_NOERR) {
        return;
    }

    (void)fprintf(stderr,"line %d of %s: %s\n", line, file, nc_strerror(stat));
    fflush(stderr);
    exit(1);
}
/*
 * Create an empty netCDF file laid out for image data.
 *
 * ncpath - path of the file to create; creation fails (and the program
 *          exits through check_err) if the file already exists, because
 *          NC_NOCLOBBER is used
 * side   - length of the "x" and "y" dimensions
 *
 * The file gets three dimensions, "z" (unlimited), "x" and "y", and one
 * NC_INT variable per pixel attribute, named "a" through "k", each shaped
 * (z, x, y). No variable data is written. Returns 0; any netCDF failure
 * terminates the program.
 */
static int
create_ncfile(char *ncpath, int side) {
    /* dimensions, in the order they are defined and used for every variable */
    static const char *const dim_names[3] = {"z", "x", "y"};
    /* one variable per pixel attribute, in definition order */
    static const char *const var_names[] = {
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"
    };
    const size_t var_count = sizeof(var_names) / sizeof(var_names[0]);
    const size_t dim_lens[3] = {NC_UNLIMITED, side, side};
    int dim_ids[3];
    int var_id;
    int file_id;
    int rc;
    size_t n;

    /* creating the file leaves it in define mode */
    rc = nc_create(ncpath, NC_NOCLOBBER|NC_64BIT_OFFSET, &file_id);
    check_err(rc, __LINE__, __FILE__);

    for (n = 0; n < 3; n++) {
        rc = nc_def_dim(file_id, dim_names[n], dim_lens[n], &dim_ids[n]);
        check_err(rc, __LINE__, __FILE__);
    }

    /* every variable is rank 3 over (z, x, y) */
    for (n = 0; n < var_count; n++) {
        rc = nc_def_var(file_id, var_names[n], NC_INT, 3, dim_ids, &var_id);
        check_err(rc, __LINE__, __FILE__);
    }

    /* leave define mode */
    rc = nc_enddef(file_id);
    check_err(rc, __LINE__, __FILE__);

    /* nothing is written here; the file is closed with its schema only */
    rc = nc_close(file_id);
    check_err(rc, __LINE__, __FILE__);

    return 0;
}
/*
 * Write one image into an open netCDF file as slab z_idx of every
 * per-attribute variable.
 *
 * ncid  - open, writable netCDF handle that contains the variables
 *         "a" through "k"
 * z_idx - index along the first dimension at which the image is written
 * side  - image side length in pixels; img must hold side * side pixels
 * img   - the pixels, only read
 *
 * Each pixel carries all of its attributes together, whereas every
 * netCDF variable holds a single attribute, so one attribute at a time is
 * gathered into a contiguous scratch buffer and written with a single
 * nc_put_vara_int() call.
 *
 * Returns 0. Allocation failure or any netCDF error terminates the
 * program.
 */
static int do_load_image(int ncid, int z_idx, int side, struct pixel *img)
{
    static const char *const vnames[PIXEL_ATTRS] = {
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"
    };
    /*
     * Widen before multiplying: side * side evaluated in int overflows
     * for large images before it is ever converted to a size.
     */
    const size_t npixels = (size_t)side * (size_t)side;
    const size_t start[3] = {(size_t)z_idx, 0, 0};
    const size_t count[3] = {1, (size_t)side, (size_t)side};
    size_t pixel_idx;
    int *attr;
    int varid;
    int stat;
    int i;

    /* calloc also rejects an element count whose byte size would overflow */
    attr = calloc(npixels, sizeof(*attr));
    if (!attr) {
        perror("calloc");
        exit(1);
    }

    for (i = 0; i < PIXEL_ATTRS; i++) {
        /* gather attribute i of every pixel, in file order */
        for (pixel_idx = 0; pixel_idx < npixels; pixel_idx++) {
            attr[pixel_idx] = img[pixel_idx].attrs[i];
        }

        stat = nc_inq_varid(ncid, vnames[i], &varid);
        check_err(stat, __LINE__, __FILE__);
        stat = nc_put_vara_int(ncid, varid, start, count, attr);
        check_err(stat, __LINE__, __FILE__);
    }

    /* the scratch buffer was previously leaked on every call */
    free(attr);
    return 0;
}
/*
 * Look up the current length of the named dimension in an open netCDF
 * file. Any netCDF error terminates the program through check_err().
 */
static size_t dim_length(int ncid, const char *name)
{
    int rc;
    int id;
    size_t len;

    rc = nc_inq_dimid(ncid, name, &id);
    check_err(rc, __LINE__, __FILE__);
    rc = nc_inq_dimlen(ncid, id, &len);
    check_err(rc, __LINE__, __FILE__);
    return len;
}

/*
 * Load the raw image at imgpath into the netCDF file at ncpath as slab
 * z_idx.
 *
 * ncpath  - existing netCDF file, opened for writing
 * imgpath - raw image file holding side * side struct pixel records
 * z_idx   - position along "z" to write; must equal the current length
 *           of "z", i.e. images have to be added in increasing order
 * side    - expected side length of the image, and expected length of
 *           the "x" and "y" dimensions
 *
 * The image file size and the three dimension lengths are verified before
 * anything is written. Returns 0; every failure prints a message and
 * exits with status 1.
 */
static int load_image(char *ncpath, char *imgpath, int z_idx, off_t side)
{
    struct stat info;
    struct pixel *pixels;
    size_t len;
    int nc_handle;
    int fd;
    int rc;

    /* destination netCDF file */
    rc = nc_open(ncpath, NC_WRITE, &nc_handle);
    check_err(rc, __LINE__, __FILE__);

    /* source image */
    fd = open(imgpath, O_RDONLY);
    if (fd < 0) {
        perror("open");
        exit(1);
    }

    rc = fstat(fd, &info);
    if (rc) {
        perror("fstat");
        exit(1);
    }

    /* the file must hold exactly side * side pixels */
    if (info.st_size != (side * side * sizeof(struct pixel))) {
        fprintf(stderr, "err: not enough pixels in image\n");
        fprintf(stderr, " - img size = %llu\n", (unsigned long long)info.st_size);
        fprintf(stderr, " - expected = %llu\n",
                (unsigned long long)(side*side*sizeof(struct pixel)));
        exit(1);
    }

    /* "x" and "y" in the netCDF file must both match the side length */
    len = dim_length(nc_handle, "x");
    if (len != side) {
        fprintf(stderr, "err: dimlen(x) = %d != %d (expected)\n",
                (int)len, (int)side);
        exit(1);
    }

    len = dim_length(nc_handle, "y");
    if (len != side) {
        fprintf(stderr, "err: dimlen(y) = %d != %d (expected)\n",
                (int)len, (int)side);
        exit(1);
    }

    /* the new slab must land directly after the ones already stored */
    len = dim_length(nc_handle, "z");
    if (len != z_idx) {
        fprintf(stderr, "err: dimlen(z) = %d != %d (expected)\n",
                (int)len, (int)z_idx);
        exit(1);
    }

    /* map the whole image read-only instead of reading it into a buffer */
    pixels = mmap(NULL, info.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (pixels == MAP_FAILED) {
        perror("mmap");
        exit(1);
    }

    do_load_image(nc_handle, z_idx, side, pixels);

    munmap(pixels, info.st_size);
    close(fd);

    rc = nc_close(nc_handle);
    check_err(rc, __LINE__, __FILE__);
    return 0;
}
/*
 * Parse a command-line argument as a decimal integer in [0, INT_MAX].
 *
 * text    - the option argument
 * optname - option it belongs to, used only in the error message
 *
 * Unlike atoi(), empty input, trailing garbage, negative values and
 * out-of-range values are rejected: a diagnostic is printed and the
 * program exits with status 1.
 */
static int parse_nonneg_int(const char *text, const char *optname)
{
    char *end;
    long val;

    errno = 0;
    val = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE ||
        val < 0 || val > INT_MAX) {
        fprintf(stderr, "invalid value for %s: '%s'\n", optname, text);
        exit(1);
    }
    return (int)val;
}

/*
 * Entry point: optionally create the netCDF file, then load one image
 * into it.
 *
 * Options:
 *   -c: create the netCDF file first (fails if it already exists)
 *   -s: length of an image side (required)
 *   -z: index in the z dimension at which to load the image (required)
 *   -i: image path (required)
 *   -n: netCDF file path (required)
 *
 * Returns 0 on success; every error path exits with status 1.
 */
int main(int argc, char **argv)
{
    /*
     * getopt() returns int. Storing it in a plain char breaks the
     * comparison with -1 wherever char is unsigned, so the option loop
     * would never terminate there.
     */
    int c;
    int do_create = 0;
    int side = -1;
    int z_idx = -1;
    char *imgpath = NULL;
    char *ncpath = NULL;

    while ((c = getopt(argc, argv, "cs:z:i:n:")) != -1) {
        switch (c) {
        case 'c':
            do_create = 1;
            break;
        case 's':
            side = parse_nonneg_int(optarg, "-s");
            break;
        case 'z':
            z_idx = parse_nonneg_int(optarg, "-z");
            break;
        case 'i':
            /* optarg points into argv, which outlives main; no copy needed */
            imgpath = optarg;
            break;
        case 'n':
            ncpath = optarg;
            break;
        default:
            fprintf(stderr, "usage: [-c] -i <img> -z <ts> -s <len> -n <nc>\n");
            exit(1);
        }
    }

    /* expected length of each side of the image */
    if (side < 0) {
        fprintf(stderr, "-s <side length> is required.\n");
        exit(1);
    }

    /* the position in the z-dim to load the image */
    if (z_idx < 0) {
        fprintf(stderr, "-z <time step> is required.\n");
        exit(1);
    }

    /* path to the image */
    if (!imgpath) {
        fprintf(stderr, "-i <image path> is required.\n");
        exit(1);
    }

    /* path to the target netcdf file */
    if (!ncpath) {
        fprintf(stderr, "-n <netcdf file> is required.\n");
        exit(1);
    }

    /* throws error if file already exists */
    if (do_create) {
        create_ncfile(ncpath, side);
    }

    /* load image. expects file to exist */
    load_image(ncpath, imgpath, z_idx, side);

    return 0;
}