-
Notifications
You must be signed in to change notification settings - Fork 21
/
search.py
150 lines (131 loc) · 4.73 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# coding=utf-8
import re
import urllib.request
from multiprocessing import Pool
from urllib.request import urlopen
from bs4 import BeautifulSoup
import utils
default_max_wait_time = 3 # Default maximum wait time: 3 seconds (passed as the timeout when fetching async results)
# Separators tried, in this order, by split_option when breaking an option into fragments.
option_split_word = ['的', '之', '、', '和']
def search(question, option_arr, is_negative):
    """Query both Baidu sources in parallel and pick the best option.

    Args:
        question: Raw question text; it is URL-quoted before being sent.
        option_arr: Candidate answer strings.
        is_negative: Forwarded to ``get_result``; when true the
            lowest-scoring option is selected instead of the highest.

    Returns:
        Whatever ``get_result`` returns for the combined scores (the chosen
        option, or ``None`` when no option scored).
    """
    wd = urllib.request.quote(question)
    # Exactly two tasks are submitted, so two workers suffice. Leaving the
    # ``with`` block terminates the pool, which guarantees that a worker
    # still blocked on a slow HTTP request does not outlive this call.
    with Pool(processes=2) as pool:
        source_1 = pool.apply_async(search_baidu, args=(wd, option_arr))
        source_2 = pool.apply_async(search_zhidao, args=(wd, option_arr))
        pool.close()
        source_arr = get_source(source_1, source_2)
    print('分数统计是:{}'.format(source_arr))
    best_answer = get_result(source_arr, option_arr, is_negative)
    return best_answer
# Baidu web search
def search_baidu(question, option_arr):
    """Score each option by its occurrences in Baidu web-search results.

    Args:
        question: Query text. It is inserted into the URL unchanged, so it
            must already be URL-quoted.
        option_arr: Candidate answer strings.

    Returns:
        A list with one integer per option: 5 points for every result
        ``div`` whose text contains the option. All zeros when the page has
        no ``content_left`` container.
    """
    # Adjacent literals keep the header on one logical line with single
    # spaces between tokens (a backslash continuation inside the literal
    # would fuse the tokens or embed source indentation).
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/63.0.3239.132 Safari/537.36')
    head = {'User-Agent': user_agent}
    url = 'https://www.baidu.com/s?wd={}'.format(question)
    print(url)
    request = urllib.request.Request(url, headers=head)
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(request) as response:
        body = BeautifulSoup(response.read(), 'html5lib')

    # One score slot per option, whatever the number of options is.
    source_arr = [0] * len(option_arr)
    content_left = body.find('div', id='content_left')
    if content_left is None:
        return source_arr

    # Strip all whitespace so options can be matched across line breaks.
    result_list = [
        re.sub(r'\s', '', content.get_text())
        for content in content_left.findAll('div')
    ]
    for result in result_list:
        for j, op in enumerate(option_arr):
            if op in result:  # each result containing the option adds 5 points
                source_arr[j] += 5
    return source_arr
# Baidu Zhidao (Q&A) search
def search_zhidao(question, option_arr):
    """Score each option against Baidu Zhidao search results.

    Args:
        question: Query text. It is inserted into the URL unchanged, so it
            must already be URL-quoted.
        option_arr: Candidate answer strings.

    Returns:
        A list with one integer per option. For every collected answer
        text an option earns:
          * 5 points per fragment from ``split_option`` found in the text,
          * 10 points if the whole option is found in the text,
          * 20 points if the option follows one of the cue words
            答案 / 结果 / 而是 within at most four characters.
    """
    url = 'https://zhidao.baidu.com/search?word={}'.format(question)
    print(url)
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(url) as response:
        # Parse the page
        body = BeautifulSoup(response.read(), 'html5lib')

    # Either container can be missing from the page; look each one up
    # separately so a missing container is skipped instead of raising
    # AttributeError on ``None``.
    header = body.find(class_='list-header')
    good_result_div = header.find('dd') if header is not None else None
    inner = body.find(class_='list-inner')
    second_result_div = inner.find(class_='list') if inner is not None else None

    result_list = []
    if good_result_div is not None:
        result_list.append(good_result_div.get_text())
    if second_result_div is not None:
        for each_result in second_result_div.findAll('dl'):
            result_text = re.sub(r'\s', '', each_result.get_text())
            result_list.append(result_text)
            print(result_text)

    # Per-option data is computed once rather than once per answer text.
    prepared = []
    for op in option_arr:
        fragments = split_option(op) or []  # simple word-splitting of the option
        # A non-capturing group matches the cue *words*; a bracketed class
        # would match any single one of their characters (or '|').  The
        # option is escaped so regex metacharacters in it are literal.
        cue_pattern = re.compile('(?:答案|结果|而是).{0,4}' + re.escape(op))
        prepared.append((op, fragments, cue_pattern))

    source_arr = [0] * len(option_arr)
    for result in result_list:
        for j, (op, fragments, cue_pattern) in enumerate(prepared):
            for op_wd in fragments:
                if op_wd in result:
                    source_arr[j] += 5
            if op in result:  # each answer containing the option adds 10 points
                source_arr[j] += 10
            if cue_pattern.search(result) is not None:
                source_arr[j] += 20
    return source_arr
def get_result(source_arr, option_arr, is_negate):
    """Pick the option whose score is the extreme value in ``source_arr``.

    Args:
        source_arr: Scores, positionally aligned with ``option_arr``.
        option_arr: Candidate answer strings.
        is_negate: When true select the lowest score, otherwise the highest.

    Returns:
        The option at the first position holding the selected score, or
        ``None`` when there are no scores or the highest score is 0.
    """
    has_signal = len(source_arr) > 0 and max(source_arr) != 0
    if not has_signal:
        return None

    choose = min if is_negate else max
    target_score = choose(source_arr)
    winner = option_arr[source_arr.index(target_score)]

    # Dump every individual score, one per line.
    for score in source_arr:
        print(score)
    return winner
def get_source(source_1, source_2):
    """Collect both asynchronous score lists and combine them.

    Args:
        source_1: Async result of the Baidu web search.
        source_2: Async result of the Baidu Zhidao search.

    Returns:
        The result of ``utils.over_add`` applied to the two score lists.
        A source that failed or timed out contributes zeros, sized to match
        the other source (three zeros when both failed).
    """
    s1 = _fetch_scores(source_1)
    s2 = _fetch_scores(source_2)
    # Size the zero fallback after the source that did answer, so both
    # lists handed to over_add have the same length.
    if s1 is None:
        s1 = [0] * (len(s2) if s2 is not None else 3)
    if s2 is None:
        s2 = [0] * len(s1)
    print('百度网页搜索结果:{}'.format(s1))
    print('百度知道结果:{}.'.format(s2))
    source_arr = utils.over_add(s1, s2)
    return source_arr


def _fetch_scores(async_result):
    """Return the worker's score list, or None if it failed or timed out."""
    try:
        return async_result.get(default_max_wait_time)
    except Exception:
        # Best effort: a failed or slow source must not abort the search.
        # KeyboardInterrupt / SystemExit are deliberately not caught here.
        return None
def split_option(option):
    """Split an option on the first separator that yields several fragments.

    The separators in ``option_split_word`` are tried in order. Empty
    fragments (produced by a leading, trailing or doubled separator) are
    discarded: an empty string is a substring of every text, so callers
    that test ``fragment in text`` would otherwise award points to every
    result.

    Args:
        option: The option string to split.

    Returns:
        A list of at least two non-empty fragments, or ``None`` when no
        separator splits the option into two or more non-empty parts.
    """
    for wd in option_split_word:
        fragments = [part for part in option.split(wd) if part]
        if len(fragments) > 1:
            return fragments
    return None
if __name__ == '__main__':
    # Manual smoke test: split a sample option and show the fragments.
    opt = split_option(
        '我和书'
    )
    print(opt)