-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
147 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
# -*- coding: UTF-8 -*- | ||
import cmath
import math
import sys
reload(sys)
sys.setdefaultencoding('utf8')
from BitVector import BitVector
|
||
class BloomFilter(object):
    """Bloom filter stored in a ``BitVector`` bit array.

    The bit-array size ``m`` and the number of hash functions ``k`` are
    derived from the requested false-positive rate ``p`` and the expected
    element count ``n``:  m = -n * ln(p) / (ln 2)^2  and  k = (m / n) * ln 2.
    Each hash function is the same polynomial string hash evaluated with a
    different prime seed.
    """

    def __init__(self, error_rate, elementNum):
        """Size the filter, allocate its bit array and pick the hash seeds.

        Args:
            error_rate: Target false-positive probability, 0 < error_rate < 1.
            elementNum: Expected number of elements to insert, must be > 0.

        Raises:
            ValueError: If either argument is outside its valid range.
        """
        if not 0 < error_rate < 1:
            raise ValueError(
                'error_rate must be between 0 and 1 (exclusive), got %r'
                % (error_rate,))
        if elementNum <= 0:
            raise ValueError(
                'elementNum must be positive, got %r' % (elementNum,))

        # Number of bits required for the requested false-positive rate.
        ln2 = math.log(2.0)
        required_bits = -1 * elementNum * math.log(error_rate) / (ln2 * ln2)

        # Round the size up to a whole number of 4-byte words.
        self.bit_num = self.align_4byte(required_bits)

        # Allocate the bit array.
        self.bit_array = BitVector(size=self.bit_num)

        # Number of hash functions: truncate and add one, so the count is
        # never zero.
        self.hash_num = int(ln2 * self.bit_num / elementNum) + 1

        # One prime seed per hash function.
        self.hash_seeds = self.generate_hashseeds(self.hash_num)

    def _bit_index(self, element, seed):
        """Return the bit position that ``seed`` assigns to ``element``."""
        # abs() guards against negative hashes; the modulo keeps the index
        # inside the bit array.
        return abs(self.hash_element(element, seed)) % self.bit_num

    def insert_element(self, element):
        """Add ``element`` by setting one bit per hash seed."""
        for seed in self.hash_seeds:
            self.bit_array[self._bit_index(element, seed)] = 1

    def is_element_exist(self, element):
        """Return True if every bit for ``element`` is set, else False.

        A True result may be a false positive; a False result means the
        element was never inserted.
        """
        for seed in self.hash_seeds:
            if self.bit_array[self._bit_index(element, seed)] == 0:
                return False
        return True

    def align_4byte(self, bit_num):
        """Round ``bit_num`` up to a multiple of 32 bits (minimum 32).

        Values that are already a multiple of 32 are returned unchanged;
        zero and negative values yield 32 so the filter is never empty.
        """
        words = int(math.ceil(bit_num / 32.0))
        return 32 * max(1, words)

    @staticmethod
    def _is_prime(number):
        """Return True if ``number`` is prime (trial division up to sqrt)."""
        if number < 2:
            return False
        divisor = 2
        # The bound is inclusive so that squares of primes are rejected.
        while divisor * divisor <= number:
            if number % divisor == 0:
                return False
            divisor += 1
        return True

    def generate_hashseeds(self, hash_num):
        """Return ``hash_num`` prime seeds, spaced at least 50 apart.

        The search starts at 10 and continues until enough seeds are found,
        so the returned list never contains placeholder zeros.
        """
        # Minimum difference between two consecutive seeds.
        gap = 50
        hash_seeds = []
        candidate = 10
        while len(hash_seeds) < hash_num:
            if self._is_prime(candidate):
                hash_seeds.append(candidate)
                # Nothing closer than ``gap`` to this seed can qualify.
                candidate += gap
            else:
                candidate += 1
        return hash_seeds

    def hash_element(self, element, seed):
        """Return the polynomial hash of ``str(element)`` for ``seed``.

        Starting from 1, each character folds in as
        ``hash = hash * seed + ord(char)``.
        """
        hash_val = 1
        for ch in str(element):
            hash_val = hash_val * seed + ord(ch)
        return hash_val
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# -*- coding: UTF-8 -*- | ||
import urllib | ||
import uuid | ||
# File extensions that downloadHtml preserves on the saved file name.
endWithArr = [
    '.pdf', '.html', '.doc', '.docx',
    '.png', '.jpg', '.gif',
    '.txt', '.xml',
    '.ppt', '.xls', '.xlsx',
]
|
||
def cbk(a, b, c):
    """Progress callback: print the download percentage.

    Args:
        a: Number of data blocks downloaded so far.
        b: Size of one data block.
        c: Size of the remote file. A non-positive value (urlretrieve
            passes -1 when the server does not report a size) means the
            total is unknown.
    """
    if c <= 0:
        # Without a known total no percentage can be computed; the original
        # code printed negative values or divided by zero here.
        return
    # The last block is usually only partly filled, so cap at 100%.
    per = min(100.0, 100.0 * a * b / c)
    print('%.2f%%' % per)
def downloadHtml(url, saveDir='在这里输入你想要保存下载文件的路径'):
    """Download ``url`` into ``saveDir`` under a random file name.

    The file name is a UUID1 hex string. If ``url`` ends with one of the
    extensions in ``endWithArr`` that extension is kept; otherwise
    ``.html`` is used. Progress is reported through ``cbk``.

    Args:
        url: Address of the resource to download.
        saveDir: Directory the file is written to. The default is the
            original placeholder text and should be replaced by a real path.

    Returns:
        The local path the resource was saved to.
    """
    # Local import keeps the function usable on both Python 2 and Python 3.
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    # First known extension the URL ends with; '.html' when none matches.
    suffix = next((ext for ext in endWithArr if url.endswith(ext)), '.html')
    fileName = uuid.uuid1().hex + suffix
    local = '%s/%s' % (saveDir, fileName)
    urlretrieve(url, local, cbk)
    return local
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
import re | ||
import urllib | ||
def getHtml(url):
    """Fetch ``url`` and return the raw response body.

    The body is returned exactly as read from the connection (a byte
    string), and the connection is closed even if reading fails.
    """
    # Local import keeps the function usable on both Python 2 and Python 3.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    page = urlopen(url)
    try:
        return page.read()
    finally:
        page.close()
def getMp4(html):
    """Download every ``.mp4`` URL found in ``html``.

    Files are written to the current directory as ``1.mp4``, ``2.mp4``, ...
    in order of first appearance; a URL that occurs several times is
    downloaded once.

    Args:
        html: Page source as text, or as bytes (decoded as UTF-8).

    Returns:
        List of the local file names that were written.
    """
    # Local import keeps the function usable on both Python 2 and Python 3.
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    # Python 3 responses are bytes; on Python 2 bytes is str, so skip there.
    if isinstance(html, bytes) and not isinstance(html, str):
        html = html.decode('utf-8', 'replace')

    # Non-greedy and bounded by whitespace/quotes/angle brackets, so two
    # links on the same line are not merged into one bogus match.
    re_mp4 = re.compile(r"""http[^\s"'<>]*?\.mp4""")

    saved = []
    seen = set()
    filename = 1
    for mp4url in re_mp4.findall(html):
        if mp4url in seen:
            continue
        seen.add(mp4url)
        urlretrieve(mp4url, "%s.mp4" % filename)
        print('file "%s.mp4" done' % filename)
        saved.append("%s.mp4" % filename)
        filename += 1
    return saved
# Script entry point: ask for a page URL, fetch it and download its videos.
# Guarded so that importing this module does not prompt or download.
if __name__ == "__main__":
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    url = read_line("please input the source url:")
    html = getHtml(url)
    getMp4(html)