Changeset 1660:592058a16c4d

Show
Ignore:
Timestamp:
06/13/08 17:13:29 (2 months ago)
Author:
hadoopsandholm@…
Branch:
default
Message:

fixed corr reduce code

Location:
src/grid/hadoop
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • src/grid/hadoop/benchmarks/digg/topic-correlation-reduce

    r1659 r1660  
    11#! /usr/bin/python 
    22import sys 
     3from math import sqrt 
     4 
     5topic_desc = ['offbeat_news','gadgets','videos_comedy','world_news','tech_news','apple','health','videos_music','playable_web_games','security','environment','space','linux_unix','videos_educational','videos_people','politics','xbox_360','music','videos_animation','hardware','business_finance','2008_us_elections','programming','design','software','general_sciences','gaming_news','other_sports','movies','pc_games','mods','political_opinion','baseball','nintendo_wii','extreme_sports','microsoft','celebrity','playstation_3','television','tennis','videos_gaming','football','tech_deals','videos_sports','motorsport','basketball','hockey','golf','soccer','comics_animation','odd_stuff','educational','xbox','comedy','people','nintendo','playstation','pets_animals','travel_places','arts_culture','food_drink','autos'] 
     6topic_map = {} 
     7 
     8for i in range(0, len(topic_desc)): 
     9    topic_map["%d" % (i+1)] = topic_desc[i] 
     10topic_map['0'] = topic_map['1'] 
    311 
    412def add_value(key, value, map): 
     
    715    map[key] += value 
    816 
    9 users = 0 
    10 key_map = {} 
     17def corr_sort(x, y): 
     18    if x['corr'] < y['corr']: 
     19        return -1     
     20    elif x['corr'] > y['corr']: 
     21        return 1     
     22    return 0 
     23 
     24sums = {} 
     25topics = {} 
    1126for line in sys.stdin: 
    12     users += 1 
    1327    line = line.strip() 
    1428    cols = line.split('\t') 
    15     i = 1 
    16     topic_map = {} 
    17     while i < len(cols): 
    18         topic = cols[i] 
    19         prob = float(cols[i+1])  
    20         #print "%s_mean\t%.20f" % (topic, prob) 
    21         add_value("%s_mean" % topic, prob, key_map) 
    22         topic_map[topic] = prob 
    23         #print "%s_square\t%.20f" % (topic, prob**2) 
    24         add_value("%s_square" % topic, prob**2, key_map) 
    25         i += 2 
    26     for topic, val in topic_map.iteritems(): 
    27         for topic2, val2 in topic_map.iteritems(): 
    28             if topic != topic2: 
    29                 t1 = long(topic) 
    30                 t2 = long(topic2) 
    31                 t_1 = min(t1,t2) 
    32                 t_2 = max(t1,t2) 
    33                 #print "%d_%d_cross\t%.20f" % (t_1,t_2,val*val2) 
    34                 add_value("%d_%d_cross" % (t_1,t_2),val*val2, key_map) 
    35 print "%s\t%d" % ("users",users) 
    36 for key, value in key_map.iteritems(): 
    37     print "%s\t%.20f" % (key, value) 
    38     #print >>sys.stderr,"%s\t%.20f" % (key, value) 
     29    if cols[0].endswith("_mean"): 
     30        topics[cols[0].replace('_mean','')] = 0 
     31    add_value(cols[0], float(cols[1]), sums) 
     32topic_names = topics.keys() 
     33topic_names.sort(lambda x,y: int(x)-int(y)) 
     34corr_list = [] 
     35n = float(sums['users']) 
     36for topic1 in topic_names: 
     37    for topic2 in topic_names: 
     38        if topic1 == topic2: 
     39            continue 
     40        if not sums.has_key(topic1 + "_" + topic2 + "_cross"): 
     41            continue 
     42        sum1 = sums[topic1 + "_mean"] 
     43        sum2 = sums[topic2 + "_mean"] 
     44        sum12 = sums[topic1 + "_" + topic2 + "_cross"] 
     45        sqr1 = sums[topic1 + "_sqr"] 
     46        sqr2 = sums[topic2 + "_sqr"] 
     47        corr = (n * sum12 - sum1 * sum2)/(sqrt(n * sqr1-sum1**2)*sqrt(n*sqr2-sum2**2)) 
     48        corr_list.append({'t1':topic1,'t2':topic2,'corr': corr}) 
     49corr_list.sort(corr_sort) 
     50for corr in corr_list: 
     51    print "%.5f %s %s %s %s" % (corr['corr'],corr['t1'],topic_map[corr['t1']],corr['t2'],topic_map[corr['t2']])         
  • src/grid/hadoop/bin/createvm

    r1650 r1660  
    3232MEMORY=`cat ${HADOOP_ROOT}/hadoop.conf | grep memory_limit | awk '{print $2}'` 
    3333DISK=`cat ${HADOOP_ROOT}/hadoop.conf | grep disk_limit | awk '{print $2}'` 
     34CPU=`cat ${HADOOP_ROOT}/hadoop.conf | grep cpu_limit | awk '{print $2}'` 
    3435MEM=`${HADOOP_INSTALL}/bin/mult ${MEMORY} 1e-6` 
    3536DIS=`${HADOOP_INSTALL}/bin/mult ${DISK} 1e-9` 
    36 PREFS="CPU:${CPUWEIGHT},0.01,1.0 disk:${DISKWEIGHT},${DIS}GB,${DIS}GB memory:${MEMWEIGHT},${MEM}MB,${MEM}MB" 
     37CPU=`${HADOOP_INSTALL}/bin/mult ${CPU} 1e-6` 
     38 
     39PREFS="CPU:${CPUWEIGHT},${CPU}MHz,3.0GHz disk:${DISKWEIGHT},${DIS}GB,${DIS}GB memory:${MEMWEIGHT},${MEM}MB,${MEM}MB" 
    3740 
    3841MASTER=`cat ${HADOOP_ROOT}/.master 2>/dev/null`