ext/text_rank/page_rank_sparse_native.c
#include <ruby.h>
#include <math.h>
#include <page_rank_sparse_native.h>
const size_t NODE_LIST_SIZE = sizeof(NodeListStruct);
const size_t EDGE_LIST_SIZE = sizeof(EdgeListStruct);
const size_t NODE_SIZE = sizeof(NodeStruct);
const size_t EDGE_SIZE = sizeof(EdgeStruct);
const size_t GRAPH_SIZE = sizeof(GraphStruct);
static const rb_data_type_t graph_typed_data = {
"PageRank/SparseNative/Graph",
{ 0, free_graph, },
0, 0,
RUBY_TYPED_FREE_IMMEDIATELY,
};
//////////////////////////////////////////////////////////////////////////////////////
void Init_sparse_native() {
VALUE PageRankModule, SparseNativeClass;
PageRankModule = rb_const_get(rb_cObject, rb_intern("PageRank"));
SparseNativeClass = rb_const_get(PageRankModule, rb_intern("SparseNative"));
rb_define_alloc_func(SparseNativeClass, sparse_native_allocate);
rb_define_private_method(SparseNativeClass, "_add_edge", sparse_native_add_edge, 3);
rb_define_private_method(SparseNativeClass, "_calculate", sparse_native_calculate, 3);
}
VALUE sparse_native_allocate(VALUE self) {
Graph g = malloc(GRAPH_SIZE);
// Grab a reference to the hash type used by a generic Ruby {}
// which accepts any key and any value. We'll need this type to create
// a st_table in which to put arbitrary VALUE keys. This hash type
// should be a static constant and thus should be safe to utilize without
// fear of garbage collection.
const struct st_hash_type *objhash = rb_hash_tbl(rb_hash_new(), "page_rank_sparse_native.c", 40)->type;
g->node_count = 0;
g->nodes = NULL;
g->dangling_nodes = NULL;
g->node_lookup = st_init_table_with_size(objhash, 0);
return TypedData_Wrap_Struct(self, &graph_typed_data, g);
}
VALUE sparse_native_add_edge(VALUE self, VALUE source, VALUE dest, VALUE weight) {
Graph g;
TypedData_Get_Struct(self, GraphStruct, &graph_typed_data, g);
add_edge_with_labels(g, source, dest, NUM2DBL(weight));
return Qnil;
}
VALUE sparse_native_calculate(VALUE self, VALUE max_iterations, VALUE damping, VALUE tolerance) {
Graph g;
VALUE ranks;
TypedData_Get_Struct(self, GraphStruct, &graph_typed_data, g);
calculate(g, FIX2INT(max_iterations), NUM2DBL(damping), NUM2DBL(tolerance));
ranks = rb_hash_new();
sort_and_normalize_ranks(g, rb_hash_dset, ranks);
return ranks;
}
void rb_hash_dset(VALUE hash, VALUE key, double value) {
rb_hash_aset(hash, key, DBL2NUM(value));
}
//////////////////////////////////////////////////////////////////////////////////////
void free_graph(void *data) {
Graph g = (Graph)data;
free_node_list(g->nodes, free_node);
free_node_list(g->dangling_nodes, NULL);
free(g->node_lookup);
free(g);
}
void free_node(Node n) {
free_edge_list(n->source_edges, free_edge);
free(n);
}
void free_node_list(NodeList nodes, void (*free_item)(Node)) {
while (nodes != NULL) {
NodeList tmp = nodes;
nodes = nodes->next;
if (free_item) {
free_item(tmp->node);
}
free(tmp);
}
}
void free_edge(Edge e) {
// Assume source node was allocated elsewhere and will be free'd elsewhere
free(e);
}
void free_edge_list(EdgeList edges, void (*free_item)(Edge)) {
while (edges != NULL) {
EdgeList tmp = edges;
edges = edges->next;
if (free_item) {
free_item(tmp->edge);
}
free(tmp);
}
}
//////////////////////////////////////////////////////////////////////////////////////
Node add_node(Graph g, VALUE label) {
NodeList tmp = malloc(NODE_LIST_SIZE);
tmp->node = malloc(NODE_SIZE);
tmp->node->label = label;
tmp->node->source_edges = NULL;
tmp->node->rank = 0.0;
tmp->node->prev_rank = 0.0;
tmp->node->outbound_weight_total = 0.0;
tmp->next = g->nodes;
g->nodes = tmp;
g->node_count += 1;
return tmp->node;
}
Node add_dangling_node(Graph g, Node n) {
NodeList tmp = malloc(NODE_LIST_SIZE);
tmp->node = n;
tmp->next = g->dangling_nodes;
g->dangling_nodes = tmp;
return n;
}
Edge add_edge(Node source, Node destination, double weight) {
EdgeList tmp = malloc(EDGE_LIST_SIZE);
tmp->edge = malloc(EDGE_SIZE);
tmp->edge->source = source;
tmp->edge->weight = weight;
tmp->next = destination->source_edges;
destination->source_edges = tmp;
source->outbound_weight_total += weight;
return tmp->edge;
}
Edge add_edge_with_labels(Graph g, VALUE source_label, VALUE dest_label, double weight) {
Node source, dest;
source = lookup_node(g, source_label);
dest = lookup_node(g, dest_label);
return add_edge(source, dest, weight);
}
Node lookup_node(Graph g, VALUE label) {
Node n;
if (!st_lookup(g->node_lookup, (st_data_t)label, (st_data_t *)&n)) {
n = add_node(g, label);
st_add_direct(g->node_lookup, (st_data_t)label, (st_data_t)n);
}
return n;
}
//////////////////////////////////////////////////////////////////////////////////////
void calculate_start(Graph g) {
NodeList nodes;
Node source, destination;
EdgeList edges;
Edge e;
for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
destination = nodes->node;
// If there is no outband, this is a "dangling" node
if (destination->outbound_weight_total == 0.0) {
add_dangling_node(g, destination);
}
// Normalize all source edge weights
for (edges = destination->source_edges; edges != NULL; edges = edges->next) {
e = edges->edge;
source = e->source;
e->weight = e->weight / source->outbound_weight_total;
}
// Set the initial rank
destination->prev_rank = 0;
destination->rank = 1.0 / g->node_count;
}
}
void calculate_step(Graph g, double damping) {
NodeList nodes, dangling_nodes;
Node source, destination;
EdgeList edges;
Edge e;
// Set prev rank to rank for all nodes
for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
destination = nodes->node;
destination->prev_rank = destination->rank;
}
// Re-destribute the rankings according to weight
for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
destination = nodes->node;
double sum = 0.0;
for (edges = destination->source_edges; edges != NULL; edges = edges->next) {
e = edges->edge;
source = e->source;
sum += source->prev_rank * e->weight;
}
for (dangling_nodes = g->dangling_nodes; dangling_nodes != NULL; dangling_nodes = dangling_nodes->next) {
source = dangling_nodes->node;
sum += source->prev_rank / g->node_count;
}
destination->rank = damping * sum + (1 - damping) / g->node_count;
}
}
// Calculate the Euclidean distance from prev_rank to rank across all nodes
double prev_distance(Graph g) {
double sum_squares = 0.0;
for (NodeList nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
Node n = nodes->node;
double rank_diff = n->prev_rank - n->rank;
sum_squares += rank_diff * rank_diff;
}
return sqrt(sum_squares);
}
void calculate(Graph g, int max_iterations, double damping, double tolerance) {
calculate_start(g);
while (max_iterations != 0) { // If negative one, allow to go without limit
calculate_step(g, damping);
if (prev_distance(g) < tolerance) {
break;
}
max_iterations--;
}
}
int node_compare(const void *v1, const void *v2) {
double rank1 = (*(Node *)v1)->rank;
double rank2 = (*(Node *)v2)->rank;
double cmp = rank2 - rank1; // Decreasing order
if (cmp < 0) return -1;
if (cmp > 0) return 1;
return 0;
}
void sort_and_normalize_ranks(Graph g, void (*callback)(VALUE, VALUE, double), VALUE callback_arg) {
NodeList nodes;
Node n;
double sum = 0.0;
unsigned long i;
Node *tmp;
i = g->node_count;
tmp = malloc(g->node_count * sizeof(Node));
for (nodes = g->nodes; nodes != NULL; nodes = nodes->next) {
n = nodes->node;
tmp[--i] = n;
sum += n->rank;
}
qsort(tmp, g->node_count, sizeof(Node), node_compare);
for (i = 0; i < g->node_count; i++) {
n = tmp[i];
callback(callback_arg, n->label, n->rank / sum);
}
free(tmp);
}