Friday, October 2, 2009

Powers of Binomials in C

Expanding (a + b)^n yields the summation of the 2^n terms. This program will show you the expanded formula. The example below shows (1.6 + 0.4)^3 expanded and solved:

$ ./binomial 1.6 0.4 3
(1.600000 + 0.400000)^3
= (1.600000 + 0.400000)(1.600000 + 0.400000)(1.600000 + 0.400000)
= (1.600000)(1.600000)(1.600000) + (1.600000)(1.600000)(0.400000) + (1.600000)(0.400000)(1.600000) + (1.600000)(0.400000)(0.400000) + (0.400000)(1.600000)(1.600000) + (0.400000)(1.600000)(0.400000) + (0.400000)(0.400000)(1.600000) + (0.400000)(0.400000)(0.400000)
= 8.000000



Rather than use the FOIL method to compute the 2^20 terms of the expansion, it makes use of the fact that all of the terms consist of of either 4 or 2; a binary dichotomy. Instead an algorithm to convert a the value of a variable of type long to a binary representation is used.


$ time ./binomial 4.0 2.0 20 | tail -n 1
= 3656158440062976.000000

real 0m17.482s
user 0m16.650s
sys 0m0.830s


It may be slow, but at least accurately computed (4 + 2)^20 properly using a very similar algorithm to that of a person doing algebra with pencil and paper who had never heard of the Binomial Theorem.
For solving Powers of Binomials problems, using Binomial Theorem instead would have been an approach which used less running time, of course. Or the following:

double result = pow(a + b, exp);


Here is my binomial.c file:


/*
* Copyright (c) 2009, Sean A.O. Harney
*
* Calculate (a+b)^exp
* TODO: Breaks with exponents larger than 30, check for arch specific limit when parsing cmdline args
* TODO: Change it to use C99 fixed width integer types instead of long.
*/

#include <stdio.h>
#include <stdlib.h>

double binomial_expansion(double a, double b, int exp);
double calcterm(unsigned long x, int width, double a, double b);

int main(int argc, char *argv[])
{
double a, b;
int exp; //positive.
char *endptr;

if (argc < 4)
{
fprintf(stderr, "Usage:\t%s a b exp\n", argv[0]);
exit(1);
}

a = strtod(argv[1], &endptr); // C89
if (endptr == NULL)
{
perror("strtod()");
exit(1);
}

b = strtod(argv[2], &endptr);
if (endptr == NULL)
{
perror("strtod()");
exit(1);
}

exp = strtol(argv[3], &endptr, 10); // atoi() does not error check
if (endptr == NULL)
{
perror("strtol()");
exit(1);
}
if (exp <= 0)
{
fprintf(stderr, "exp must be positive integer.\n");
exit(1);
}

binomial_expansion(a, b, exp);
exit(0);
}


// return value as computed
double binomial_expansion(double a, double b, int exp)
{
long num_terms; // will be 2^exp terms
long i;
double total = 0;

printf("(%f + %f)^%d\n\t= ", a, b, exp);

for (i = 0, num_terms = 2; i < exp; i++)
{
if (i < exp - 1)
num_terms *= 2; //raise num_terms by one power for all iteration but last
printf("(%f + %f)", a, b);
}


printf("\n\t= ");
for (i = 0; i < num_terms; i++)
{
total += calcterm(i, exp, a, b);
if (i < num_terms - 1)
printf(" + ");
}

printf("\n\t= %f\n", total);

return total;
}


// modified from a convert int to binary string representation function
double calcterm(unsigned long x, int width, double a, double b)
{
double s[64]; // max exp size
int i = width - 1;
double total = 0.0;

do
{
s[i--] = (x & 1) ? b : a; // b for 1, a for 0
x >>= 1;
}
while (x > 0);
while (i >= 0)
s[i--] = a; // fill with a (0), all must be fixed width

for (i = 0; i < width; i++)
{
if (i == 0)
total = s[0];
else
total *= s[i];
printf("(%f)", s[i]);
}

return total;
}

Tuesday, July 7, 2009

Grouping blogger.com blogs based on the adsense account. Does Google do enough to protect its user's privacy?

Hello, as you may be aware, Google's popular blogger.com/blogspot.com blogging site allows you to easily monetize your blog by serving Adsense advertisements on it.

While a blogger has the option of not displaying their profile on the page, it is still possible to group who owns which blogs because of the Adsense publisher id!


/*
* 7/7/2009
*
* gcc -lcurl -I/usr/include/libxml2 -lxml2 -o getblognames getblognames.c
*
* Generates a list of blogspot.com/blogger.com blog URLs
* by accessing http://www.blogger.com/next-blog to get a random blog redirect.
*
* Based on http://cool.haxx.se/cvs.cgi/curl/docs/examples/getinmemory.c
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <curl/curl.h>
#include <curl/types.h>
#include <curl/easy.h>

#include <libxml/HTMLparser.h>





struct MemoryStruct {
char *memory;
size_t size;
};

static void *myrealloc(void *ptr, size_t size);

static void *myrealloc(void *ptr, size_t size)
{
/* There might be a realloc() out there that doesn't like reallocing
NULL pointers, so we take care of it here */
if (ptr)
return realloc(ptr, size);
else
return malloc(size);
}

static size_t
WriteMemoryCallback(void *ptr, size_t size, size_t nmemb, void *data)
{
size_t realsize = size * nmemb;
struct MemoryStruct *mem = (struct MemoryStruct *) data;

mem->memory = myrealloc(mem->memory, mem->size + realsize + 1);
if (mem->memory)
{
memcpy(&(mem->memory[mem->size]), ptr, realsize);
mem->size += realsize;
mem->memory[mem->size] = 0;
}
return realsize;
}

void print_ahrefs(xmlNode * a_node)
{
xmlNode *cur_node = NULL;

for (cur_node = a_node; cur_node; cur_node = cur_node->next)
{
if (cur_node->type == XML_ELEMENT_NODE)
{
if (!strcasecmp((char *) cur_node->name, "a"))
{
unsigned char *url =
xmlGetProp(cur_node, (xmlChar *) "href");
printf("%s\n", url);
fflush(stdout);
xmlFree(url);
}
}

print_ahrefs(cur_node->children);
}
}

int main(int argc, char **argv)
{
CURL *curl;
CURLcode res;

struct MemoryStruct chunk;

curl_global_init(CURL_GLOBAL_ALL);

curl = curl_easy_init();

curl_easy_setopt(curl, CURLOPT_URL,
"http://www.blogger.com/next-blog");

/* send all data to this function */
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);

/* we pass our 'chunk' struct to the callback function */
curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *) &chunk);

while (1)
{
chunk.memory = NULL; /* we expect realloc(NULL, size) to work */
chunk.size = 0; /* no data at this point */

res = curl_easy_perform(curl);
if (res != 0)
{
fprintf(stderr, "%s\n", curl_easy_strerror(res));
exit(1);
}

if (chunk.memory)
{
//assumes it is a string
//printf("%s\n", chunk.memory);
htmlDocPtr doc =
htmlParseDoc((unsigned char *) chunk.memory, NULL);
xmlNode *root_element;

if (doc == NULL)
{
fprintf(stderr, "error parsing HTML\n");
exit(1);
}

root_element = xmlDocGetRootElement(doc);
print_ahrefs(root_element);
xmlFreeDoc(doc); //no htmlFreeDoc(), this seems to work to prevent memory leak however.
free(chunk.memory);
}

}
curl_easy_cleanup(curl);
curl_global_cleanup();

exit(0);
}


The above code will output random blogger URLs line by line. Using a persistent HTTP connection, it will continue to retreive http://www.blogger.com/next-blog and extract out the redirection link. It requires libcurl and libxml2 to compile.

The next program is a simple shell script which will parse the output of the above program and in turn create a CSV (comma seperated variable) file consisting of blogURL, AdsensePublisherID
If the blog does not have adsense, the second column will be left blank.



#!/bin/sh
#
# cat blogsURLlist.txt | ./getadpublishers.sh > blogads.csv


while read url ; do
echo "\"$url\" , \""`wget -qO- "$url" |grep google_ad_client| \
head -n 1| sed "s:google_ad_client[ ]*=[ ]*\"pub-\([0-9]*\)\";:\1:"`'"'
done


Now, to put it all into action:

./getblognames | ./getadpublishers.sh > blogs.xml


This will run indefinitely as there are millions of blogs, and it does not check for duplicates!
It may be a wise idea to run the getblognames program independently to generate lists of the blog URLs beforehand, but I will simply pipe it to the shell script here.

Now it will be trivial to determine which Google Adsense publisher IDs are shared between supposedly independent blogs!

If Google wants to protect the identity of their Advertisers perhaps they should assign multiple publisher IDs to each publisher, so that they can put a unique one on each page!

I will post the results when I have some, hopefully there will be lulz to be had!