'''
Created on Jan 12, 2015

@author: santhosh
'''

from __future__ import division

import numpy
import re
import sys

from util import PlotUtil


# Alternation of three one-character classes: newline, space, tab.
# Not referenced anywhere else in this file; presumably meant to be used as a
# regular expression with the `re` module -- confirm before removing.
skip_words = '[\n]|[ ]|[\t]'


# Nine fixed rows, each one a permutation of the integers 1..10.
# Only referenced by a commented-out line in jac_doc_hash, where it stands in
# for the randomly generated permutation matrix M.
sample_m = [
    [10, 6, 4, 2, 7, 1, 8, 5, 3, 9],
    [6, 4, 5, 3, 7, 8, 2, 10, 9, 1],
    [5, 2, 4, 8, 10, 9, 1, 3, 7, 6],
    [8, 6, 1, 7, 9, 4, 2, 10, 3, 5],
    [8, 4, 9, 1, 10, 3, 7, 5, 2, 6],
    [6, 5, 2, 7, 8, 9, 10, 1, 3, 4],
    [4, 1, 9, 2, 10, 7, 8, 6, 3, 5],
    [4, 9, 8, 5, 3, 2, 6, 7, 1, 10],
    [1, 5, 9, 6, 7, 8, 10, 4, 2, 3],
]

def createAndReturnShingles(texts, n):
    """Map every n-character chunk to the texts that contain it.

    Each text is scanned character by character; spaces and NUL characters
    ('\\x00') are dropped. The remaining characters are cut into consecutive,
    non-overlapping chunks of length ``n``; a shorter chunk left over at the
    end of a text is recorded as well.

    :param texts: sequence of strings.
    :param n: chunk length in characters.
    :return: dict mapping chunk -> numpy array of length ``len(texts)`` whose
        entry ``i`` is 1 when the chunk occurs in ``texts[i]`` and 0 otherwise.
    """
    total = len(texts)
    occurrences = {}

    def mark(chunk, doc):
        # Create the indicator row on first sight, then flag this document.
        row = occurrences.get(chunk)
        if row is None:
            row = occurrences[chunk] = numpy.zeros(total)
        row[doc] = 1

    for doc, text in enumerate(texts):
        pending = ''
        for ch in text:
            if ch not in (' ', '\x00'):
                pending += ch
            # Checked for every character, including skipped ones.
            if len(pending) == n:
                mark(pending, doc)
                pending = ''

        # Trailing partial chunk.
        if pending:
            mark(pending, doc)

    return occurrences

def createAndReturnWords(texts):
    """Map every whitespace-separated word to the texts that contain it.

    Each text is split into lines and each line into words. Every word is
    passed through ``_normalize_word`` before being used as a key.

    An entry of ``texts`` that cannot be split into lines (for example
    ``None``) is reported on standard output together with the exception
    type and contributes no words; its column stays all zero.

    :param texts: sequence of strings.
    :return: dict mapping word -> numpy array of length ``len(texts)`` whose
        entry ``i`` is 1 when the word occurs in ``texts[i]`` and 0 otherwise.
    """
    total = len(texts)
    occurrences = dict()
    for text_index, text in enumerate(texts):
        try:
            lines = text.splitlines()
        except Exception:
            # Best effort: report the offending entry and carry on. Written
            # via sys.stdout so the line is valid on Python 2 and Python 3.
            sys.stdout.write('%s %s\n' % (text, sys.exc_info()[0]))
            continue

        for line in lines:
            for word in line.split():
                word = _normalize_word(word)
                if word not in occurrences:
                    occurrences[word] = numpy.zeros(total)
                occurrences[word][text_index] = 1
    return occurrences


def _normalize_word(word):
    """Return ``word`` with escapes decoded and non-ASCII characters dropped.

    The word is returned unchanged when the conversion is not possible:
    on any ``UnicodeError`` (this includes the ``UnicodeEncodeError`` that
    Python 2 raises for a non-ASCII ``unicode`` word) or when the object has
    no ``decode`` method (text strings on Python 3).
    """
    try:
        return word.decode('unicode_escape').encode('ascii', 'ignore')
    except (UnicodeError, AttributeError):
        return word

def formDataMatrix(texts):
    """Build the word-by-text indicator matrix for ``texts``.

    :param texts: sequence of strings, forwarded to createAndReturnWords.
    :return: numpy array with one row per distinct word and one column per
        text; an entry is 1 when the word occurs in that text, else 0.
        Rows follow the iteration order of the word dictionary.
    """
    word_rows = createAndReturnWords(texts)

    matrix = numpy.zeros(shape=(len(word_rows), len(texts)))
    for row, indicator in enumerate(word_rows.values()):
        matrix[row] = indicator

    return matrix

# Autogenerated with SMOP version 
# main.py /media/santhosh/Data/workspace/datalab/data/src/jac_doc_hash.m

def jac_doc_hash(A,r,b):
    """Group documents into candidate sets using banded min-hash signatures.

    ``r * b`` random permutations of 1..n are drawn with ``numpy.random``.
    The signature of a document is, for every permutation, the smallest
    permuted value over the rows where the document's column is positive.
    The signature is cut into ``b`` bands of ``r`` rows; documents whose
    signatures agree on all rows of a band share a bucket for that band.

    :param A: 2-D numpy array of shape (n, m); column ``i`` describes
        document ``i`` and ``A[k, i] > 0`` marks row ``k`` as present (the
        layout formDataMatrix produces).
    :param r: number of signature rows per band.
    :param b: number of bands.
    :return: integer numpy array of length m. Entry ``i`` is the group label
        of document ``i``; labels start as the document's own index and are
        lowered to the smallest label found in each bucket.

    Documents without any positive entry keep an all-zero signature, so they
    always end up in one bucket together (permuted values start at 1, so a
    non-empty document never has that signature).
    """
    n, m = A.shape
    s = r * b

    # One random permutation of 1..n per hash function (one row of M each).
    M = numpy.zeros(shape=(s, n))
    for i in range(s):
        M[i, :] = numpy.random.permutation(numpy.arange(1, n + 1))

    # Signature matrix: per document, the row-wise minimum of M restricted to
    # the rows that are present. Taking the minimum along axis 1 of the
    # selected columns keeps one value per hash function even when a single
    # row is present.
    S = numpy.zeros(shape=(s, m))
    for i in range(m):
        present = A[:, i] > 0
        if present.any():
            S[:, i] = M[:, present].min(axis=1)

    candidategroups = numpy.arange(0, m)

    for j in range(b):
        from_ = r * j
        to = from_ + r

        # Bucket documents by the exact values of this band. Buckets are kept
        # in first-seen order so the merge below is deterministic for a given
        # signature matrix.
        slot_of = dict()
        buckets = []
        for i in range(m):
            key = tuple(S[from_:to, i])
            slot = slot_of.get(key)
            if slot is None:
                slot_of[key] = len(buckets)
                buckets.append([i])
            else:
                buckets[slot].append(i)

        # NOTE(review): labels are merged in a single pass, so a document is
        # not relabelled when a bucket-mate from an earlier band is lowered
        # later on. Confirm whether full transitive merging is wanted.
        for members in buckets:
            candidategroups[members] = candidategroups[members].min()

    return candidategroups

# r= 4 and b = 10 --> 0.5  
# r = 20 and b = 50 --> 0.8
def s_curve(r = 20,b = 50):
    """Plot p = 1 - (1 - s**r)**b for s = 0.00, 0.01, ..., 0.99.

    :param r: exponent applied to s (rows per band in jac_doc_hash terms).
    :param b: exponent applied to (1 - s**r) (number of bands).
    :return: None; the curve is handed to PlotUtil.plotCurve.
    """
    similarity = numpy.arange(0, 1, 0.01)
    miss_one_band = 1 - similarity ** r
    probability = 1 - miss_one_band ** b
    PlotUtil.plotCurve(similarity, probability)
