/**
 * @file    no_partitioning_join.c
 * @brief   Main-memory "no partitioning" hash join implementation for
 *          multi-core CPUs (single- and multi-threaded variants).
 */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <sched.h>              /* CPU_ZERO, CPU_SET */
#include <pthread.h>            /* pthread_* */
#include <inttypes.h>           /* PRIu64, PRId64 */
#include <string.h>             /* memset */
#include <stdio.h>              /* printf */
#include <stdlib.h>             /* malloc, exit */
#include <sys/time.h>           /* gettimeofday */
22
24#include "npj_params.h" /* constant parameters */
25#include "npj_types.h" /* bucket_t, hashtable_t, bucket_buffer_t */
26#include "rdtsc.h" /* startTimer, stopTimer */
27#include "lock.h" /* lock, unlock */
28#include "cpu_mapping.h" /* get_cpu_id */
29#ifdef PERF_COUNTERS
30#include "perf_counters.h" /* PCM_x */
31#endif
32
33#include "barrier.h" /* pthread_barrier_* */
34#include "affinity.h" /* pthread_attr_setaffinity_np */
35#include "generator.h" /* numa_localize() */
36
37#ifdef JOIN_RESULT_MATERIALIZE
38#include "tuple_buffer.h" /* for materialization */
39#endif
40
/* Wait at the given pthread barrier; aborts the whole process if the wait
 * fails with anything other than PTHREAD_BARRIER_SERIAL_THREAD (which is a
 * success code returned to exactly one waiter). RV receives the result. */
#ifndef BARRIER_ARRIVE
#define BARRIER_ARRIVE(B,RV) \
    RV = pthread_barrier_wait(B); \
    if(RV !=0 && RV != PTHREAD_BARRIER_SERIAL_THREAD){ \
        printf("Couldn't wait on barrier\n"); \
        exit(EXIT_FAILURE); \
    }
#endif

/* Round V up to the next power of two in place (bit-smearing trick;
 * works for 32-bit values: V must be an unsigned integer lvalue). */
#ifndef NEXT_POW_2
#define NEXT_POW_2(V) \
    do { \
        V--; \
        V |= V >> 1; \
        V |= V >> 2; \
        V |= V >> 4; \
        V |= V >> 8; \
        V |= V >> 16; \
        V++; \
    } while(0)
#endif

/* Modulo-style hash: mask out the relevant bits of key X, then drop the
 * SKIP low bits. With skip_bits == 0 this is simply X mod num_buckets. */
#ifndef HASH
#define HASH(X, MASK, SKIP) (((X) & MASK) >> SKIP)
#endif

/* Debug message printing: enabled only when compiled with -DDEBUG. */
#ifdef DEBUG
#define DEBUGMSG(COND, MSG, ...) \
    if(COND) { fprintf(stdout, "[DEBUG] "MSG, ## __VA_ARGS__); }
#else
#define DEBUGMSG(COND, MSG, ...)
#endif
80
82extern int numalocalize; /* defined in generator.c */
83extern int nthreads; /* defined in generator.c */
84
/* Per-thread argument/result record handed to npo_thread(). */
typedef struct arg_t arg_t;

struct arg_t {
    int32_t tid;                 /* logical thread id (0 is the timekeeper) */
    hashtable_t * ht;            /* shared hash table built from relR */
    relation_t relR;             /* this thread's chunk of the build relation */
    relation_t relS;             /* this thread's chunk of the probe relation */
    pthread_barrier_t * barrier; /* shared barrier for phase synchronization */
    int64_t num_results;         /* number of join matches found by this thread */

    /* results of the thread */
    threadresult_t * threadresult; /* materialized output slot (if enabled) */

#ifndef NO_TIMING
    /* stats about the thread; only thread-0 fills these in */
    uint64_t timer1, timer2, timer3; /* overall, build-phase, partition(=0) */
    struct timeval start, end;       /* wall-clock start/end of the join */
#endif
} ;
107
121void
123{
124 bucket_buffer_t * overflowbuf;
125 overflowbuf = (bucket_buffer_t*) malloc(sizeof(bucket_buffer_t));
126 overflowbuf->count = 0;
127 overflowbuf->next = NULL;
128
129 *ppbuf = overflowbuf;
130}
131
140static inline void
141get_new_bucket(bucket_t ** result, bucket_buffer_t ** buf)
142{
143 if((*buf)->count < OVERFLOW_BUF_SIZE) {
144 *result = (*buf)->buf + (*buf)->count;
145 (*buf)->count ++;
146 }
147 else {
148 /* need to allocate new buffer */
149 bucket_buffer_t * new_buf = (bucket_buffer_t*)
150 malloc(sizeof(bucket_buffer_t));
151 new_buf->count = 1;
152 new_buf->next = *buf;
153 *buf = new_buf;
154 *result = new_buf->buf;
155 }
156}
157
159void
161{
162 do {
163 bucket_buffer_t * tmp = buf->next;
164 free(buf);
165 buf = tmp;
166 } while(buf);
167}
168
182void
183allocate_hashtable(hashtable_t ** ppht, uint32_t nbuckets)
184{
185 hashtable_t * ht;
186
187 ht = (hashtable_t*)malloc(sizeof(hashtable_t));
188 ht->num_buckets = nbuckets;
189 NEXT_POW_2((ht->num_buckets));
190
191 /* allocate hashtable buckets cache line aligned */
192 if (posix_memalign((void**)&ht->buckets, CACHE_LINE_SIZE,
193 ht->num_buckets * sizeof(bucket_t))){
194 perror("Aligned allocation failed!\n");
195 exit(EXIT_FAILURE);
196 }
197
200 if(numalocalize) {
201 tuple_t * mem = (tuple_t *) ht->buckets;
202 uint32_t ntuples = (ht->num_buckets*sizeof(bucket_t))/sizeof(tuple_t);
203 numa_localize(mem, ntuples, nthreads);
204 }
205
206 memset(ht->buckets, 0, ht->num_buckets * sizeof(bucket_t));
207 ht->skip_bits = 0; /* the default for modulo hash */
208 ht->hash_mask = (ht->num_buckets - 1) << ht->skip_bits;
209 *ppht = ht;
210}
211
217void
219{
220 free(ht->buckets);
221 free(ht);
222}
223
230void
232{
233 uint32_t i;
234 const uint32_t hashmask = ht->hash_mask;
235 const uint32_t skipbits = ht->skip_bits;
236
237 for(i=0; i < rel->num_tuples; i++){
238 tuple_t * dest;
239 bucket_t * curr, * nxt;
240 int32_t idx = HASH(rel->tuples[i].key, hashmask, skipbits);
241
242 /* copy the tuple to appropriate hash bucket */
243 /* if full, follow nxt pointer to find correct place */
244 curr = ht->buckets + idx;
245 nxt = curr->next;
246
247 if(curr->count == BUCKET_SIZE) {
248 if(!nxt || nxt->count == BUCKET_SIZE) {
249 bucket_t * b;
250 b = (bucket_t*) calloc(1, sizeof(bucket_t));
251 curr->next = b;
252 b->next = nxt;
253 b->count = 1;
254 dest = b->tuples;
255 }
256 else {
257 dest = nxt->tuples + nxt->count;
258 nxt->count ++;
259 }
260 }
261 else {
262 dest = curr->tuples + curr->count;
263 curr->count ++;
264 }
265 *dest = rel->tuples[i];
266 }
267}
268
279int64_t
280probe_hashtable(hashtable_t *ht, relation_t *rel, void * output)
281{
282 uint32_t i, j;
283 int64_t matches;
284
285 const uint32_t hashmask = ht->hash_mask;
286 const uint32_t skipbits = ht->skip_bits;
287#ifdef PREFETCH_NPJ
288 size_t prefetch_index = PREFETCH_DISTANCE;
289#endif
290
291 matches = 0;
292
293#ifdef JOIN_RESULT_MATERIALIZE
294 chainedtuplebuffer_t * chainedbuf = (chainedtuplebuffer_t *) output;
295#endif
296
297 for (i = 0; i < rel->num_tuples; i++)
298 {
299#ifdef PREFETCH_NPJ
300 if (prefetch_index < rel->num_tuples) {
301 intkey_t idx_prefetch = HASH(rel->tuples[prefetch_index++].key,
302 hashmask, skipbits);
303 __builtin_prefetch(ht->buckets + idx_prefetch, 0, 1);
304 }
305#endif
306
307 intkey_t idx = HASH(rel->tuples[i].key, hashmask, skipbits);
308 bucket_t * b = ht->buckets+idx;
309
310 do {
311 for(j = 0; j < b->count; j++) {
312 if(rel->tuples[i].key == b->tuples[j].key){
313 matches ++;
314
315 #ifdef JOIN_RESULT_MATERIALIZE
316 /* copy to the result buffer */
317 tuple_t * joinres = cb_next_writepos(chainedbuf);
318 joinres->key = b->tuples[j].payload; /* R-rid */
319 joinres->payload = rel->tuples[i].payload; /* S-rid */
320#endif
321
322 }
323 }
324
325 b = b->next;/* follow overflow pointer */
326 } while(b);
327 }
328
329 return matches;
330}
331
/**
 * Prints the join timing results. Cycle counts and the tuple/result
 * numbers go to stdout/stderr in the original interleaved layout (the
 * benchmark harness parses both streams).
 *
 * Fixes over the original: uint64_t/int64_t were printed with %llu,
 * which is undefined behavior on ABIs where they are not (unsigned)
 * long long -- use the <inttypes.h> PRI macros; guard the
 * cycles-per-tuple division against numtuples == 0.
 *
 * @param total     overall join cycles
 * @param build     build-phase cycles
 * @param part      partitioning cycles (always 0 for NPO)
 * @param numtuples probe relation cardinality
 * @param result    number of join matches
 * @param start,end wall-clock timestamps of the join
 */
static void
print_timing(uint64_t total, uint64_t build, uint64_t part,
             uint64_t numtuples, int64_t result,
             struct timeval * start, struct timeval * end)
{
    double diff_usec = (((*end).tv_sec*1000000L + (*end).tv_usec)
                        - ((*start).tv_sec*1000000L+(*start).tv_usec));
    /* avoid division by zero for an empty probe relation */
    double cyclestuple = (numtuples > 0)
                         ? ((double) total / (double) numtuples) : 0.0;
    fprintf(stdout, "RUNTIME TOTAL, BUILD, PART (cycles): \n");
    fprintf(stderr, "%" PRIu64 " \t %" PRIu64 " \t %" PRIu64 " ",
            total, build, part);
    fprintf(stdout, "\n");
    fprintf(stdout, "TOTAL-TIME-USECS, TOTAL-TUPLES, CYCLES-PER-TUPLE: \n");
    /* NOTE(review): the TOTAL-TUPLES column prints the match count
       (result), not numtuples; kept as-is for output compatibility,
       but with the correct signed specifier. */
    fprintf(stdout, "%.4lf \t %" PRId64 " \t ", diff_usec, result);
    fflush(stdout);
    fprintf(stderr, "%.4lf ", cyclestuple);
    fflush(stderr);
    fprintf(stdout, "\n");

}
354
356result_t *
357NPO_st(relation_t *relR, relation_t *relS, int nthreads)
358{
359 hashtable_t * ht;
360 int64_t result = 0;
361 result_t * joinresult;
362
363#ifndef NO_TIMING
364 struct timeval start, end;
365 uint64_t timer1, timer2, timer3;
366#endif
367 uint32_t nbuckets = (relR->num_tuples / BUCKET_SIZE);
368 allocate_hashtable(&ht, nbuckets);
369
370 joinresult = (result_t *) malloc(sizeof(result_t));
371#ifdef JOIN_RESULT_MATERIALIZE
372 joinresult->resultlist = (threadresult_t *) malloc(sizeof(threadresult_t));
373#endif
374
375#ifndef NO_TIMING
376 gettimeofday(&start, NULL);
377 startTimer(&timer1);
378 startTimer(&timer2);
379 timer3 = 0; /* no partitioning */
380#endif
381
382 build_hashtable_st(ht, relR);
383
384#ifndef NO_TIMING
385 stopTimer(&timer2); /* for build */
386#endif
387
388#ifdef JOIN_RESULT_MATERIALIZE
389 chainedtuplebuffer_t * chainedbuf = chainedtuplebuffer_init();
390#else
391 void * chainedbuf = NULL;
392#endif
393
394 result = probe_hashtable(ht, relS, chainedbuf);
395
396#ifdef JOIN_RESULT_MATERIALIZE
397 threadresult_t * thrres = &(joinresult->resultlist[0]);/* single-thread */
398 thrres->nresults = result;
399 thrres->threadid = 0;
400 thrres->results = (void *) chainedbuf;
401#endif
402
403#ifndef NO_TIMING
404 stopTimer(&timer1); /* over all */
405 gettimeofday(&end, NULL);
406 /* now print the timing results: */
407 print_timing(timer1, timer2, timer3, relS->num_tuples, result, &start, &end);
408#endif
409
411
412 joinresult->totalresults = result;
413 joinresult->nthreads = 1;
414
415 return joinresult;
416}
417
426void
428 bucket_buffer_t ** overflowbuf)
429{
430 uint32_t i;
431 const uint32_t hashmask = ht->hash_mask;
432 const uint32_t skipbits = ht->skip_bits;
433
434#ifdef PREFETCH_NPJ
435 size_t prefetch_index = PREFETCH_DISTANCE;
436#endif
437
438 for(i=0; i < rel->num_tuples; i++){
439 tuple_t * dest;
440 bucket_t * curr, * nxt;
441
442#ifdef PREFETCH_NPJ
443 if (prefetch_index < rel->num_tuples) {
444 intkey_t idx_prefetch = HASH(rel->tuples[prefetch_index++].key,
445 hashmask, skipbits);
446 __builtin_prefetch(ht->buckets + idx_prefetch, 1, 1);
447 }
448#endif
449
450 int32_t idx = HASH(rel->tuples[i].key, hashmask, skipbits);
451 /* copy the tuple to appropriate hash bucket */
452 /* if full, follow nxt pointer to find correct place */
453 curr = ht->buckets+idx;
454 lock(&curr->latch);
455 nxt = curr->next;
456
457 if(curr->count == BUCKET_SIZE) {
458 if(!nxt || nxt->count == BUCKET_SIZE) {
459 bucket_t * b;
460 /* b = (bucket_t*) calloc(1, sizeof(bucket_t)); */
461 /* instead of calloc() everytime, we pre-allocate */
462 get_new_bucket(&b, overflowbuf);
463 curr->next = b;
464 b->next = nxt;
465 b->count = 1;
466 dest = b->tuples;
467 }
468 else {
469 dest = nxt->tuples + nxt->count;
470 nxt->count ++;
471 }
472 }
473 else {
474 dest = curr->tuples + curr->count;
475 curr->count ++;
476 }
477
478 *dest = rel->tuples[i];
479 unlock(&curr->latch);
480 }
481
482}
483
491void *
492npo_thread(void * param)
493{
494 int rv;
495 arg_t * args = (arg_t*) param;
496
497 /* allocate overflow buffer for each thread */
498 bucket_buffer_t * overflowbuf;
499 init_bucket_buffer(&overflowbuf);
500
501#ifdef PERF_COUNTERS
502 if(args->tid == 0){
503 PCM_initPerformanceMonitor(NULL, NULL);
504 PCM_start();
505 }
506#endif
507
508 /* wait at a barrier until each thread starts and start timer */
509 BARRIER_ARRIVE(args->barrier, rv);
510
511#ifndef NO_TIMING
512 /* the first thread checkpoints the start time */
513 if(args->tid == 0){
514 gettimeofday(&args->start, NULL);
515 startTimer(&args->timer1);
516 startTimer(&args->timer2);
517 args->timer3 = 0; /* no partitionig phase */
518 }
519#endif
520
521 /* insert tuples from the assigned part of relR to the ht */
522 build_hashtable_mt(args->ht, &args->relR, &overflowbuf);
523
524 /* wait at a barrier until each thread completes build phase */
525 BARRIER_ARRIVE(args->barrier, rv);
526
527#ifdef PERF_COUNTERS
528 if(args->tid == 0){
529 PCM_stop();
530 PCM_log("========== Build phase profiling results ==========\n");
532 PCM_start();
533 }
534 /* Just to make sure we get consistent performance numbers */
535 BARRIER_ARRIVE(args->barrier, rv);
536#endif
537
538
539#ifndef NO_TIMING
540 /* build phase finished, thread-0 checkpoints the time */
541 if(args->tid == 0){
542 stopTimer(&args->timer2);
543 }
544#endif
545
546#ifdef JOIN_RESULT_MATERIALIZE
547 chainedtuplebuffer_t * chainedbuf = chainedtuplebuffer_init();
548#else
549 void * chainedbuf = NULL;
550#endif
551
552 /* probe for matching tuples from the assigned part of relS */
553 args->num_results = probe_hashtable(args->ht, &args->relS, chainedbuf);
554
555#ifdef JOIN_RESULT_MATERIALIZE
556 args->threadresult->nresults = args->num_results;
557 args->threadresult->threadid = args->tid;
558 args->threadresult->results = (void *) chainedbuf;
559#endif
560
561#ifndef NO_TIMING
562 /* for a reliable timing we have to wait until all finishes */
563 BARRIER_ARRIVE(args->barrier, rv);
564
565 /* probe phase finished, thread-0 checkpoints the time */
566 if(args->tid == 0){
567 stopTimer(&args->timer1);
568 gettimeofday(&args->end, NULL);
569 }
570#endif
571
572#ifdef PERF_COUNTERS
573 if(args->tid == 0) {
574 PCM_stop();
575 PCM_log("========== Probe phase profiling results ==========\n");
577 PCM_log("===================================================\n");
578 PCM_cleanup();
579 }
580 /* Just to make sure we get consistent performance numbers */
581 BARRIER_ARRIVE(args->barrier, rv);
582#endif
583
584 /* clean-up the overflow buffers */
585 free_bucket_buffer(overflowbuf);
586
587 return 0;
588}
589
591result_t *
592NPO(relation_t *relR, relation_t *relS, int nthreads)
593{
594 hashtable_t * ht;
595 int64_t result = 0;
596 int32_t numR, numS, numRthr, numSthr; /* total and per thread num */
597 int i, rv;
598 cpu_set_t set;
599 arg_t args[nthreads];
600 pthread_t tid[nthreads];
601 pthread_attr_t attr;
602 pthread_barrier_t barrier;
603
604 result_t * joinresult = 0;
605 joinresult = (result_t *) malloc(sizeof(result_t));
606
607#ifdef JOIN_RESULT_MATERIALIZE
608 joinresult->resultlist = (threadresult_t *) malloc(sizeof(threadresult_t)
609 * nthreads);
610#endif
611
612 uint32_t nbuckets = (relR->num_tuples / BUCKET_SIZE);
613 allocate_hashtable(&ht, nbuckets);
614
615 numR = relR->num_tuples;
616 numS = relS->num_tuples;
617 numRthr = numR / nthreads;
618 numSthr = numS / nthreads;
619
620 rv = pthread_barrier_init(&barrier, NULL, nthreads);
621 if(rv != 0){
622 printf("Couldn't create the barrier\n");
623 exit(EXIT_FAILURE);
624 }
625
626 pthread_attr_init(&attr);
627 for(i = 0; i < nthreads; i++){
628 int cpu_idx = get_cpu_id(i);
629
630 DEBUGMSG(1, "Assigning thread-%d to CPU-%d\n", i, cpu_idx);
631
632 CPU_ZERO(&set);
633 CPU_SET(cpu_idx, &set);
634 pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &set);
635
636 args[i].tid = i;
637 args[i].ht = ht;
638 args[i].barrier = &barrier;
639
640 /* assing part of the relR for next thread */
641 args[i].relR.num_tuples = (i == (nthreads-1)) ? numR : numRthr;
642 args[i].relR.tuples = relR->tuples + numRthr * i;
643 numR -= numRthr;
644
645 /* assing part of the relS for next thread */
646 args[i].relS.num_tuples = (i == (nthreads-1)) ? numS : numSthr;
647 args[i].relS.tuples = relS->tuples + numSthr * i;
648 numS -= numSthr;
649
650 args[i].threadresult = &(joinresult->resultlist[i]);
651
652 rv = pthread_create(&tid[i], &attr, npo_thread, (void*)&args[i]);
653 if (rv){
654 printf("ERROR; return code from pthread_create() is %d\n", rv);
655 exit(-1);
656 }
657
658 }
659
660 for(i = 0; i < nthreads; i++){
661 pthread_join(tid[i], NULL);
662 /* sum up results */
663 result += args[i].num_results;
664 }
665 joinresult->totalresults = result;
666 joinresult->nthreads = nthreads;
667
668
669#ifndef NO_TIMING
670 /* now print the timing results: */
671 print_timing(args[0].timer1, args[0].timer2, args[0].timer3,
672 relS->num_tuples, result,
673 &args[0].start, &args[0].end);
674#endif
675
677
678 return joinresult;
679}
680
Affinity methods on Mac OS X. Mac OS X does not export interfaces that identify processors or control...
Barrier implementation, defaults to Pthreads. On Mac custom implementation since barriers are not inc...
#define BARRIER_ARRIVE(B, RV)
Definition: barrier.h:29
Provides cpu mapping utility function.
Provides methods to generate data sets of various types.
int numa_localize(tuple_t *relation, int64_t num_tuples, uint32_t nthreads)
Definition: generator.c:413
int64_t probe_hashtable(hashtable_t *ht, relation_t *rel, void *output)
void build_hashtable_st(hashtable_t *ht, relation_t *rel)
void destroy_hashtable(hashtable_t *ht)
void allocate_hashtable(hashtable_t **ppht, uint32_t nbuckets)
result_t * NPO(relation_t *relR, relation_t *relS, int nthreads)
void * npo_thread(void *param)
result_t * NPO_st(relation_t *relR, relation_t *relS, int nthreads)
void build_hashtable_mt(hashtable_t *ht, relation_t *rel, bucket_buffer_t **overflowbuf)
void init_bucket_buffer(bucket_buffer_t **ppbuf)
void free_bucket_buffer(bucket_buffer_t *buf)
void PCM_initPerformanceMonitor(const char *pcmcfg, const char *pcmout)
void PCM_start()
void PCM_stop()
void PCM_cleanup()
void PCM_printResults()
void PCM_log(char *msg)
int get_cpu_id(int thread_id)
Definition: cpu_mapping.c:78
#define DEBUGMSG(COND, MSG,...)
#define NEXT_POW_2(V)
int numalocalize
Definition: generator.c:47
The interface of No partitioning optimized (NPO) join algorithm.
Constant parameters used by No Partitioning Join implementations.
#define BUCKET_SIZE
Definition: npj_params.h:19
#define OVERFLOW_BUF_SIZE
Definition: npj_params.h:29
#define CACHE_LINE_SIZE
Definition: npj_params.h:24
Provides type definitions used by No Partitioning Join implementations.
An interface to the Intel Performance Counters Monitoring.
Definition: types.h:74
Definition: types.h:45
Implements a chained-buffer storage model for tuples.