Skip to content

Commit

Permalink
辅助文件
Browse files Browse the repository at this point in the history
  • Loading branch information
lzmy1993 committed Nov 12, 2015
1 parent 469335a commit 0a697eb
Show file tree
Hide file tree
Showing 3 changed files with 147 additions and 0 deletions.
104 changes: 104 additions & 0 deletions bloomFilter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# -*- coding: UTF-8 -*-
import cmath
import sys
# Python 2-only idiom: reload(sys) so that sys.setdefaultencoding can be
# called, then make UTF-8 the process-wide default encoding for implicit
# str/unicode conversions.
# NOTE(review): presumably here so that str(element) in
# BloomFilter.hash_element accepts non-ASCII unicode input - confirm before
# removing; this changes behaviour for every module in the process.
reload(sys)
sys.setdefaultencoding('utf8')
from BitVector import BitVector

class BloomFilter(object):
    """Bloom filter backed by a ``BitVector`` bit array.

    The bit-array size and the number of hash functions are derived from the
    requested false-positive rate and the expected number of elements. Each
    hash function is the same polynomial rolling hash with a different seed.
    """

    def __init__(self, error_rate, elementNum):
        """Size the filter and generate its hash seeds.

        Args:
            error_rate: Target false-positive probability, strictly between
                0 and 1.
            elementNum: Expected number of inserted elements, positive.

        Raises:
            ValueError: If either argument is outside its valid range.
        """
        if not 0 < error_rate < 1:
            raise ValueError("error_rate must be in the open interval (0, 1)")
        if elementNum <= 0:
            raise ValueError("elementNum must be positive")

        ln2 = cmath.log(2.0).real

        # Required bit count: m = -n * ln(p) / (ln 2)^2.
        raw_bits = -1 * elementNum * cmath.log(error_rate).real / (ln2 * ln2)

        # Round the bit count up to a 4-byte (32-bit) boundary.
        self.bit_num = self.align_4byte(raw_bits)

        # Allocate the bit array.
        self.bit_array = BitVector(size=self.bit_num)

        # Number of hash functions: k = ln 2 * m / n, truncated then +1.
        self.hash_num = int(ln2 * self.bit_num / elementNum) + 1

        # One seed per hash function.
        self.hash_seeds = self.generate_hashseeds(self.hash_num)

    def _bit_positions(self, element):
        """Yield the bit index chosen by each hash seed for ``element``."""
        for seed in self.hash_seeds:
            # abs() guards against a negative hash; the modulo keeps the
            # index inside the bit array.
            yield abs(self.hash_element(element, seed)) % self.bit_num

    def insert_element(self, element):
        """Set every bit that ``element`` hashes to."""
        for position in self._bit_positions(element):
            self.bit_array[position] = 1

    def is_element_exist(self, element):
        """Return True if ``element`` may be present, False if it is not.

        A single unset bit proves the element was never inserted; True can be
        a false positive.
        """
        for position in self._bit_positions(element):
            if self.bit_array[position] == 0:
                return False
        return True

    def align_4byte(self, bit_num):
        """Return ``bit_num`` rounded up to a multiple of 32 bits.

        One full 32-bit word is always added after truncation, so a count that
        is already a multiple of 32 grows by 32 (e.g. 64 -> 96).
        """
        words = int(bit_num / 32)
        return 32 * (words + 1)

    @staticmethod
    def _is_prime(number):
        """Return True if ``number`` is prime (trial division up to sqrt)."""
        if number < 2:
            return False
        limit = int(cmath.sqrt(1.0 * number).real)
        # The bound is inclusive: floor(sqrt(n)) itself must be tried,
        # otherwise composites such as 15, 25 or 35 pass as prime.
        return all(number % divisor for divisor in range(2, limit + 1))

    def generate_hashseeds(self, hash_num, gap=50):
        """Return ``hash_num`` prime seeds spaced at least ``gap`` apart.

        Primes are searched in ascending order in the range [10, 10000).

        Args:
            hash_num: Number of seeds to return.
            gap: Minimum difference between two consecutive seeds.

        Returns:
            A list of length ``hash_num``. If the search range holds fewer
            suitable primes than requested, the remaining entries are 0.
        """
        hash_seeds = []
        for candidate in range(10, 10000):
            if len(hash_seeds) >= hash_num:
                break
            # Consecutive seeds must be far enough apart.
            if hash_seeds and candidate - hash_seeds[-1] < gap:
                continue
            if self._is_prime(candidate):
                hash_seeds.append(candidate)

        # Keep the fixed-length contract when the range is exhausted.
        hash_seeds.extend([0] * (hash_num - len(hash_seeds)))
        return hash_seeds

    def hash_element(self, element, seed):
        """Return the polynomial hash of ``str(element)`` for ``seed``.

        Starting from 1, each character folds in as ``h = h * seed + ord(ch)``.
        """
        hash_val = 1
        for ch in str(element):
            hash_val = hash_val * seed + ord(ch)
        return hash_val








25 changes: 25 additions & 0 deletions download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# -*- coding: UTF-8 -*-
import urllib
import uuid
# URL suffixes that downloadHtml keeps as the saved file's extension; a URL
# ending in none of these is saved with '.html'.
endWithArr = ['.pdf','.html','.doc','.docx','.png','.jpg','.gif','.txt','.xml','.ppt','.xls','.xlsx']

def cbk(a, b, c):
    '''Progress callback: print how much of the download has completed.

    @a: number of data blocks transferred so far
    @b: size of one data block
    @c: total size of the remote file; zero or negative when unknown

    Prints the percentage (capped at 100) when the total size is known,
    otherwise the number of bytes transferred so far.
    '''
    if c <= 0:
        # Without a usable total a percentage is meaningless (and dividing
        # by zero would crash the download), so report raw progress instead.
        print('%d bytes' % (a * b))
        return
    # The last block usually overshoots the file size, hence the cap.
    per = min(100.0 * a * b / c, 100)
    print('%.2f%%' % per)
def downloadHtml(url, save_dir='在这里输入你想要保存下载文件的路径'):
    '''Download ``url`` into ``save_dir`` under a freshly generated file name.

    @url: address of the resource to download
    @save_dir: directory the file is written to; the default is the
               placeholder shipped with the script and must be replaced
               with a real path

    The file name is a UUID1 hex string plus the first suffix from
    endWithArr that ``url`` ends with, or '.html' when none matches.
    Progress is reported through cbk. Returns the local path written to.
    '''
    extension = next((ext for ext in endWithArr if url.endswith(ext)), '.html')
    fileName = uuid.uuid1().hex + extension
    local = '%s/%s' % (save_dir, fileName)
    urllib.urlretrieve(url, local, cbk)
    return local
18 changes: 18 additions & 0 deletions downloadVideo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import re
import urllib
def getHtml(url):
    """Fetch ``url`` and return the response body.

    The response object is closed even if reading fails, so the underlying
    connection is not leaked.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()
def getMp4(html, retrieve=None):
    """Download every ``.mp4`` URL found in ``html``.

    Files are saved in the current directory as ``1.mp4``, ``2.mp4``, ... in
    the order the URLs appear, and a line is printed after each download.

    Args:
        html: Page source to scan for links.
        retrieve: Callable ``retrieve(url, filename)`` that performs the
            download; defaults to ``urllib.urlretrieve``.

    Returns:
        The list of file names that were written.
    """
    if retrieve is None:
        retrieve = urllib.urlretrieve

    # Non-greedy and bounded by whitespace/quotes/angle brackets, so two
    # links on the same line are not merged into one bogus match.
    re_mp4 = re.compile(r"https?://[^\s\"'<>]*?\.mp4")

    saved = []
    for filename, mp4url in enumerate(re_mp4.findall(html), 1):
        target = "%s.mp4" % filename
        retrieve(mp4url, target)
        print('file "%s.mp4" done' % filename)
        saved.append(target)
    return saved
# Script entry point: prompt for a page URL, fetch the page and download
# every .mp4 link found in it.
# NOTE(review): these statements run at import time as well; consider an
# `if __name__ == "__main__":` guard if this module is ever imported.
url = raw_input("please input the source url:")
html = getHtml(url)
getMp4(html)

0 comments on commit 0a697eb

Please sign in to comment.