# Get how many pages the post has in total.
def getPageNum(self, page):
    """Return the post's total page count as a string of digits, or None.

    page: raw HTML of (any page of) the post.
    Returns None when no page-count element / no digits are found.
    """
    soup = BeautifulSoup(page)
    # The pager widget is the <li> carrying this exact inline style.
    pager_items = soup.select('li[style="margin-left:8px"]')
    # Initialize so an empty select result falls through to `return None`
    # instead of raising NameError on the re.findall line below.
    nums = ''
    for item in pager_items:
        nums = item.get_text()
        break  # only the first matching element is needed
    # Last run of digits in the text — the total page number.
    num = re.findall(r'[0-9]+(?=[^0-9]*$)', nums)
    result = ''.join(num)
    if result:
        return result
    else:
        return None
def setFileTitle(self, title):
    """Open the output .txt file named after *title*.

    Falls back to self.defaultTitle when the title could not be scraped
    (i.e. title is None). The open handle is stored on self.file.
    """
    # Prefer the scraped title; otherwise use the configured default.
    name = self.defaultTitle if title is None else title
    self.file = open(name + ".txt", "w+")
# 获取每一层楼的内容,传入页面内容 defgetContent(self, page): # 匹配所有楼层的内容 contents1 = [] contents2 = [] soup = BeautifulSoup(page) #获取帖子发布时间 time = soup.select('div[class="post-tail-wrap"]') for ii in time: texts = ii.get_text(' ') #将无用信息去除掉 texts2 = texts.rjust(80).replace(u'举报 | 侵权举报 有害信息举报 ', '') contents1.append(texts2 + '\n' + '\n' + '-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-') # 提取正文内容 items = soup.select('cc') for iii in items: contents2.append(iii.get_text().encode('utf8').lstrip()) #将两个list合并为一个list 重叠合并 contents = list(chain.from_iterable(zip(contents2, contents1))) return contents
# Write each floor's information to the output file.
def writeData(self, contents):
    """Append every entry in *contents* to self.file, each followed by
    two blank lines as a visual separator."""
    self.file.writelines(entry + '\n' + '\n' + '\n' for entry in contents)