Skip to main content

Converting SS-DB data into NetCDF format

·7 mins

A challenge in designing systems for scientific data analysis is the lack of representative data sets and queries. In the world of relational database systems, the TPC benchmarks serve as a common tool for comparing performance. However, little work has been done to produce benchmarks that are representative of scientific data analysis workloads. One such benchmark is SS-DB. From the Science Benchmark (SS-DB) website:

SS-DB is representative of the processing performed in a number of scientific domains in addition to astronomy, including earth science, oceanography, and medical image analysis.

The SS-DB website links to the data generator tool used to produce SS-DB data sets, which are generated in a raw binary format. Systems such as SciHadoop are designed to process NetCDF data using Hadoop. The following tool converts a raw SS-DB data set to a NetCDF file that can be used in existing tools, such as SciHadoop or the NetCDF Operator Suite.

Building the tool requires the NetCDF development libraries. Example build and usage:

gcc -Wall -o tool ssdb_nc3_loader.c -lnetcdf
usage: ./tool [-c] -i <img path> -z <z-dim idx> -s <img size> -n <out.nc>

Here is the code for the ssdb_nc3_loader.c tool:

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <getopt.h>
#include <netcdf.h>

/* each pixel in an image has 11 attributes */
#define PIXEL_ATTRS 11
/*
 * One pixel record of a raw image file: PIXEL_ATTRS consecutive ints.
 * load_image() maps the image file as an array of these records (and checks
 * that the file size is side * side * sizeof(struct pixel)); do_load_image()
 * copies attrs[0] .. attrs[10] into the netCDF variables "a" .. "k".
 */
struct pixel {
    int attrs[PIXEL_ATTRS];
};

/*
 * Abort the program when a netCDF call has failed.
 *
 *   stat - status code returned by a netCDF library call
 *   line - source line of the call site (callers pass __LINE__)
 *   file - source file of the call site (callers pass __FILE__)
 *
 * Does nothing when stat is NC_NOERR. Otherwise the call-site location and
 * the nc_strerror() text for stat are written to stderr and the process
 * exits with status 1.
 */
static void
check_err(const int stat, const int line, const char *file) {
    if (stat == NC_NOERR)
        return;

    fprintf(stderr, "line %d of %s: %s\n", line, file, nc_strerror(stat));
    fflush(stderr);
    exit(1);
}

/*
 * Create a new, empty netCDF file laid out for SS-DB images.
 *
 *   ncpath - path of the file to create; creation uses NC_NOCLOBBER, so an
 *            already existing file is reported as an error
 *   side   - length of the "x" and "y" dimensions
 *
 * The file gets three dimensions, defined in the order "z" (unlimited),
 * "x" and "y", and eleven NC_INT variables "a" .. "k", each shaped
 * (z, x, y). No data is written; the file is closed again before returning.
 *
 * Any netCDF failure terminates the process through check_err().
 * Always returns 0.
 */
static int
create_ncfile(char *ncpath, int side) {
    /* variable names, in definition order */
    static const char *const var_names[] = {
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"
    };
    enum { VAR_RANK = 3 };
    const size_t var_count = sizeof(var_names) / sizeof(var_names[0]);

    int ncid;               /* netCDF file id */
    int shape[VAR_RANK];    /* dimension ids shared by every variable: z, x, y */
    int var_id;             /* id of the variable just defined (not used further) */
    size_t v;

    /* creating the file leaves it in define mode */
    check_err(nc_create(ncpath, NC_NOCLOBBER|NC_64BIT_OFFSET, &ncid),
              __LINE__, __FILE__);

    /* dimensions */
    check_err(nc_def_dim(ncid, "z", NC_UNLIMITED, &shape[0]),
              __LINE__, __FILE__);
    check_err(nc_def_dim(ncid, "x", (size_t)side, &shape[1]),
              __LINE__, __FILE__);
    check_err(nc_def_dim(ncid, "y", (size_t)side, &shape[2]),
              __LINE__, __FILE__);

    /* variables: all share the same type, rank and shape */
    for (v = 0; v < var_count; v++) {
        check_err(nc_def_var(ncid, var_names[v], NC_INT, VAR_RANK, shape, &var_id),
                  __LINE__, __FILE__);
    }

    /* leave define mode */
    check_err(nc_enddef(ncid), __LINE__, __FILE__);

    /* nothing to write here; images are added later, one z-slice at a time */
    check_err(nc_close(ncid), __LINE__, __FILE__);

    return 0;
}

/*
 * Write one image into the open netCDF file as the z-slice z_idx.
 *
 *   ncid  - id of a netCDF file opened for writing
 *   z_idx - index along the "z" dimension that receives the image
 *   side  - image side length; img must hold side * side pixels
 *   img   - the pixels, in the order they are to be written
 *
 * The image is stored as an array of pixel records, whereas the netCDF file
 * keeps one variable per attribute. For each attribute the values of all
 * pixels are therefore gathered into a contiguous scratch buffer and written
 * with a single nc_put_vara_int() call covering (z_idx, 0..side, 0..side).
 *
 * Allocation failure and netCDF errors terminate the process.
 * Always returns 0.
 */
static int do_load_image(int ncid, int z_idx, int side, struct pixel *img)
{
    static const char *const vnames[PIXEL_ATTRS] = {
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"
    };
    /* widen before multiplying so the pixel count is not computed in int */
    const size_t npixels = (size_t)side * (size_t)side;
    size_t start[3] = {(size_t)z_idx, 0, 0};
    size_t count[3] = {1, (size_t)side, (size_t)side};
    size_t pixel_idx;
    int *attr;
    int stat;
    int varid;
    int i;

    /* refuse a buffer size that does not fit in size_t */
    if (npixels > (size_t)-1 / sizeof(*attr)) {
        fprintf(stderr, "err: image too large to buffer\n");
        exit(1);
    }

    attr = malloc(npixels * sizeof(*attr));
    if (!attr) {
        perror("malloc");
        exit(1);
    }

    for (i = 0; i < PIXEL_ATTRS; i++) {

        /* gather attribute i of every pixel */
        for (pixel_idx = 0; pixel_idx < npixels; pixel_idx++)
            attr[pixel_idx] = img[pixel_idx].attrs[i];

        stat = nc_inq_varid(ncid, vnames[i], &varid);
        check_err(stat, __LINE__, __FILE__);

        stat = nc_put_vara_int(ncid, varid, start, count, attr);
        check_err(stat, __LINE__, __FILE__);
    }

    /* the scratch buffer was previously leaked on every call */
    free(attr);

    return 0;
}

/*
 * Append one raw image file to an existing netCDF file.
 *
 *   ncpath  - netCDF file to write into; it must already exist
 *   imgpath - raw image file holding side * side pixel records
 *   z_idx   - index along the "z" dimension at which to store the image
 *   side    - expected length of each side of the image
 *
 * Before anything is written the following is verified:
 *   - the image file is exactly side * side * sizeof(struct pixel) bytes
 *   - the "x" and "y" dimensions of the netCDF file both have length side
 *   - the current length of "z" equals z_idx
 * The image is then memory-mapped read-only and handed to do_load_image().
 *
 * Every failure prints a message and terminates the process.
 * Always returns 0.
 */
static int load_image(char *ncpath, char *imgpath, int z_idx, off_t side)
{
    int ncid;
    int fd;
    int dim;
    size_t len;
    struct stat info;
    struct pixel *pixels;

    /* destination netCDF file */
    check_err(nc_open(ncpath, NC_WRITE, &ncid), __LINE__, __FILE__);

    /* source image */
    fd = open(imgpath, O_RDONLY);
    if (fd < 0) {
        perror("open");
        exit(1);
    }

    /* the size of the image file is needed for validation and for mmap */
    if (fstat(fd, &info)) {
        perror("fstat");
        exit(1);
    }

    /* the file must hold exactly side x side pixels */
    if (info.st_size != (side * side * sizeof(struct pixel))) {
        fprintf(stderr, "err: not enough pixels in image\n");
        fprintf(stderr, "  - img size = %llu\n", (unsigned long long)info.st_size);
        fprintf(stderr, "  - expected = %llu\n",
                (unsigned long long)(side*side*sizeof(struct pixel)));
        exit(1);
    }

    /* the x dimension of the netCDF file must match the image side */
    check_err(nc_inq_dimid(ncid, "x", &dim), __LINE__, __FILE__);
    check_err(nc_inq_dimlen(ncid, dim, &len), __LINE__, __FILE__);
    if (len != side) {
        fprintf(stderr, "err: dimlen(x) = %d != %d (expected)\n",
                (int)len, (int)side);
        exit(1);
    }

    /* ... and so must the y dimension */
    check_err(nc_inq_dimid(ncid, "y", &dim), __LINE__, __FILE__);
    check_err(nc_inq_dimlen(ncid, dim, &len), __LINE__, __FILE__);
    if (len != side) {
        fprintf(stderr, "err: dimlen(y) = %d != %d (expected)\n",
                (int)len, (int)side);
        exit(1);
    }

    /*
     * images have to be loaded in increasing z order: the slice being
     * loaded must be the next one after those already in the file
     */
    check_err(nc_inq_dimid(ncid, "z", &dim), __LINE__, __FILE__);
    check_err(nc_inq_dimlen(ncid, dim, &len), __LINE__, __FILE__);
    if (len != z_idx) {
        fprintf(stderr, "err: dimlen(z) = %d != %d (expected)\n",
                (int)len, (int)z_idx);
        exit(1);
    }

    /* map the whole image read-only */
    pixels = mmap(NULL, info.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (pixels == MAP_FAILED) {
        perror("mmap");
        exit(1);
    }

    /* copy the pixels into the netCDF variables */
    do_load_image(ncid, z_idx, side, pixels);

    munmap(pixels, info.st_size);
    close(fd);

    check_err(nc_close(ncid), __LINE__, __FILE__);

    return 0;
}

/*
 * Entry point: parse the command line, optionally create the netCDF file,
 * then load one image into it.
 *
 * Options:
 *   -c         create the netCDF file first (fails if it already exists)
 *   -s <len>   length of each side of the image       (required)
 *   -z <idx>   index in the z dimension to load into  (required)
 *   -i <path>  path of the raw image                  (required)
 *   -n <path>  path of the netCDF file                (required)
 *
 * Exits with status 1 on a usage error; returns 0 on success.
 */
int main(int argc, char **argv)
{
    /*
     * Must be int, not char: getopt() returns an int and signals the end of
     * the options with -1. Where plain char is unsigned, a char can never
     * compare equal to -1 and the loop below would never terminate.
     */
    int c;
    int do_create = 0;
    int side = -1;
    int z_idx = -1;
    char *imgpath = NULL;
    char *ncpath = NULL;

    while ((c = getopt(argc, argv, "cs:z:i:n:")) != -1) {
        switch (c) {
        case 'c':
            do_create = 1;
            break;
        case 's':
            side = atoi(optarg);
            break;
        case 'z':
            z_idx = atoi(optarg);
            break;
        case 'i':
            /*
             * optarg points into argv, which stays valid for the life of
             * the process, so no copy is needed (and an allocation failure
             * can no longer masquerade as a missing option).
             */
            imgpath = optarg;
            break;
        case 'n':
            ncpath = optarg;
            break;
        default:
            fprintf(stderr, "usage: [-c] -i <img> -z <ts> -s <len> -n <nc>\n");
            exit(1);
        }
    }

    /* expected length of each side of the image */
    if (side < 0) {
        fprintf(stderr, "-s <side length> is required.\n");
        exit(1);
    }

    /* the position in the z-dim to load the image */
    if (z_idx < 0) {
        fprintf(stderr, "-z <time step> is required.\n");
        exit(1);
    }

    /* path to the image */
    if (!imgpath) {
        fprintf(stderr, "-i <image path> is required.\n");
        exit(1);
    }

    /* path to the target netcdf file */
    if (!ncpath) {
        fprintf(stderr, "-n <netcdf file> is required.\n");
        exit(1);
    }

    /* throws error if file already exists */
    if (do_create)
        create_ncfile(ncpath, side);

    /* load image. expects file to exist */
    load_image(ncpath, imgpath, z_idx, side);

    return 0;
}