diff --git a/bloomFilter.py b/bloomFilter.py new file mode 100644 index 0000000..f1074dd --- /dev/null +++ b/bloomFilter.py @@ -0,0 +1,104 @@ +# -*- coding: UTF-8 -*- +import cmath +import sys +reload(sys) +sys.setdefaultencoding('utf8') +from BitVector import BitVector + +class BloomFilter(object): + def __init__(self,error_rate,elementNum): + #计算所需要的bit数 + self.bit_num = -1 * elementNum * cmath.log(error_rate) / (cmath.log(2.0) * cmath.log(2.0)) + + #四字节对齐 + self.bit_num = self.align_4byte(self.bit_num.real) + + #分配内存 + self.bit_array = BitVector(size = self.bit_num) + + #计算hash函数个数 + self.hash_num = cmath.log(2) * self.bit_num / elementNum + self.hash_num = self.hash_num.real + + #向上取整 + self.hash_num = int(self.hash_num) + 1 + + #产生hash函数种子 + self.hash_seeds = self.generate_hashseeds(self.hash_num) + + + def insert_element(self,element): + for seed in self.hash_seeds: + hash_val = self.hash_element(element,seed) + #取绝对值 + hash_val = abs(hash_val) + #取模,防越界 + hash_val = hash_val % self.bit_num + #设置相应的比特位 + self.bit_array[hash_val] = 1; + + + #检查元素是否存在,存在返回true,否则返回false + def is_element_exist(self,element): + for seed in self.hash_seeds: + hash_val = self.hash_element(element,seed) + #取绝对值 + hash_val = abs(hash_val) + #取模,防越界 + hash_val = hash_val % self.bit_num + #查看值 + if self.bit_array[hash_val] == 0: + return False + return True + + + #内存对齐 + def align_4byte(self,bit_num): + num = int(bit_num / 32) + num = 32 * (num + 1) + return num + + + #产生hash函数种子,hash_num个素数 + def generate_hashseeds(self,hash_num): + count = 0 + #连续两个种子的最小差值 + gap = 50 + #初始化hash种子为0 + hash_seeds = []; + for index in xrange(hash_num): + hash_seeds.append(0) + for index in xrange(10,10000): + max_num = int(cmath.sqrt(1.0 * index).real) + flag = 1 + for num in xrange(2,max_num): + if index % num == 0: + flag = 0 + break + + if flag == 1: + #连续两个hash种子的差值要大才行 + if count > 0 and (index - hash_seeds[count - 1]) < gap: + continue + hash_seeds[count] = index + count +=1 + + if count == hash_num: + break + return hash_seeds + + + def hash_element(self,element,seed): + hash_val = 1 + for ch in str(element): + chval = ord(ch) + hash_val = hash_val * seed + chval + return hash_val + + + + + + + + diff --git a/download.py b/download.py new file mode 100644 index 0000000..96adb4c --- /dev/null +++ b/download.py @@ -0,0 +1,25 @@ +# -*- coding: UTF-8 -*- +import urllib +import uuid +endWithArr = ['.pdf','.html','.doc','.docx','.png','.jpg','.gif','.txt','.xml','.ppt','.xls','.xlsx'] + +def cbk(a, b, c): + '''回调函数 + @a: 已经下载的数据块 + @b: 数据块的大小 + @c: 远程文件的大小 + ''' + per = 100.0 * a * b / c + if per > 100: + per = 100 + print '%.2f%%' % per +def downloadHtml(url): + fileName = uuid.uuid1().get_hex() + for str in endWithArr: + if url.endswith(str): + fileName = fileName + str + break + else: + fileName = fileName + '.html' + local = '在这里输入你想要保存下载文件的路径/%s' % (fileName) + urllib.urlretrieve(url, local, cbk) \ No newline at end of file diff --git a/downloadVideo.py b/downloadVideo.py new file mode 100644 index 0000000..67af787 --- /dev/null +++ b/downloadVideo.py @@ -0,0 +1,18 @@ +import re +import urllib +def getHtml(url): + page = urllib.urlopen(url) + html = page.read() + return html +def getMp4(html): + r = r"http.*\.mp4" + re_mp4 = re.compile(r) + mp4List = re.findall(re_mp4,html) + filename = 1 + for mp4url in mp4List: + urllib.urlretrieve(mp4url,"%s.mp4" %filename) + print 'file "%s.mp4" done' %filename + filename+=1 +url = raw_input("please input the source url:") +html = getHtml(url) +getMp4(html) \ No newline at end of file