| | |
| | """WarOnlineForum.ipynb""" |
| |
|
| | |
| |
|
| | import requests |
| | from bs4 import BeautifulSoup |
| | import re |
| | import pandas as pd |
| | import urllib.request as urllib |
| | import warnings |
| | warnings.filterwarnings("ignore") |
| |
|
| | |
# Global accumulator for the scraped corpus: one row per (quoted message,
# reply) pair found on the forum.  Filled in-place by collectDataFromPage()
# and written to CSV at the end of the script.
corpus = pd.DataFrame(columns=['Quote', 'Response'])
| |
|
def remove_substring(string, substring):
    """Delete the whole whitespace-delimited word that contains *substring*.

    If *substring* does not occur, *string* is returned unchanged.  Only the
    first occurrence is handled; the surrounding spaces are left in place.
    """
    hit = string.find(substring)
    if hit == -1:
        return string
    # Expand the match to word boundaries: back to the previous space
    # (or start of string) and forward to the next space (or end).
    word_start = string.rfind(" ", 0, hit) + 1
    word_end = string.find(" ", hit)
    if word_end == -1:
        word_end = len(string)
    return string[:word_start] + string[word_end:]
| |
|
def remove_attachments(string, substring='Посмотреть вложение'):
    """Remove an inline attachment marker ("Посмотреть вложение <id>") from *string*.

    Bug fix: the default marker itself contains a space, so the original
    ``string.find(" ", index)`` matched the space *inside* the marker and only
    its first word was deleted, leaving ``вложение <id>`` behind.  We now skip
    past the whole marker plus the attachment-id token that follows it.

    Only the first occurrence is removed; if the marker is absent the string
    is returned unchanged.
    """
    index = string.find(substring)
    if index == -1:
        return string
    # Search for the space *after* the marker and its trailing id token.
    end_index = string.find(" ", index + len(substring) + 1)
    if end_index == -1:
        end_index = len(string)
    return string[:index] + string[end_index:]
| |
|
def _clean_fragment(text):
    """Shared clean-up pipeline for a quote or response fragment:
    drop the word containing '.com', strip attachment markers and
    @-mentions, and collapse runs of whitespace to single spaces."""
    text = remove_substring(text, ".com")
    text = remove_attachments(text)
    return ' '.join(remove_substring(text, "@").split())


def collectDataFromPage(url):
    """Scrape one forum page and append (quote, response) pairs to the
    module-level ``corpus`` DataFrame.

    Each message body (``div.bbWrapper``) that contains a ``blockquote``
    contributes one row: the quoted text becomes 'Quote' and the rest of
    the message becomes 'Response'.  Messages without a quote are skipped.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    for message_content in soup.find_all("div", class_="bbWrapper"):
        # Original used a bare except to skip messages without a quote;
        # make that intent explicit with a guard instead.
        blockquote = message_content.find("blockquote")
        if blockquote is None:
            continue

        message_text = message_content.text.strip()
        quoted_text = blockquote.text.strip()
        quoted_text = ''.join(BeautifulSoup(quoted_text, "html.parser").findAll(string=True))
        # Drop the "Click to expand..." UI label from both fragments.
        quoted_text = quoted_text.replace('Нажмите для раскрытия...', '')
        message_text = message_text.replace('Нажмите для раскрытия...', '')

        # The quote header ends with '(а): '; keep only the body after it,
        # strip URLs, then run the shared clean-up pipeline.
        quote = _clean_fragment(
            re.sub(r'http\S+', '', ' '.join(quoted_text.split()).partition('(а): ')[2])
        )
        # The response is the message with the quoted part cut out.
        message = _clean_fragment(' '.join(message_text.replace(quoted_text, '').split()))

        if message and quote:
            corpus.loc[len(corpus)] = [quote, message]
| |
|
def compare_pages(url1, url2):
    """Heuristic page-identity check: treat two URLs as the same page when
    their raw HTML payloads have exactly the same length.

    (Unused fallback — compare_pages2 is the redirect-based check the
    crawler actually relies on.)
    """
    length_first = len(requests.get(url1).text)
    length_second = len(requests.get(url2).text)
    return length_first == length_second
| |
|
def compare_pages2(url1, url2):
    """Return True when both URLs resolve (after redirects) to one final URL.

    XenForo redirects requests past the last page back to the last page, so
    this detects "page N+1 is really page N" while paginating.
    """
    final_first = urllib.urlopen(url1).geturl()
    final_second = urllib.urlopen(url2).geturl()
    return final_first == final_second
| |
|
| |
|
def pages_of_thread(thread, startingPage=1):
    """Crawl every page of *thread*, feeding each one to collectDataFromPage.

    Stops when a page fails to load (non-200) or when requesting the next
    page redirects back to the current one (i.e. the last page was reached).
    """
    page = startingPage
    while True:
        page_url = thread + '/page-' + str(page)
        response = requests.get(page_url)
        if response.status_code != 200:
            break
        collectDataFromPage(url=page_url)
        print(f'finished page #{page}')
        # Past-the-end pages redirect back to the last real page.
        if compare_pages2(page_url, thread + '/page-' + str(page + 1)):
            break
        page += 1
| |
|
| | |
| | |
| |
|
| | """______________________________________ Main Code __________________________________________""" |
| |
|
| | |
base_url = 'https://waronline.org'

# Subforum to crawl (URL-encoded Russian title).
url = "https://waronline.org/fora/index.php?forums/%D0%92%D0%9C%D0%A4-%D0%B3%D1%80%D0%B0%D0%B6%D0%B4%D0%B0%D0%BD%D1%81%D0%BA%D0%B8%D0%B9-%D1%84%D0%BB%D0%BE%D1%82.12/"

base_page = 1
lastSubForumPage = False

# Walk the subforum page by page, scraping every thread linked on each page.
while not lastSubForumPage:
    page_url = url + 'page-' + str(base_page)
    response = requests.get(page_url)
    forum_threads = []

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")

        # Collect every thread link on this subforum page; the trailing
        # "/post-NNN" (or page) path component is cut so each thread is
        # crawled from its first page.
        for link in soup.find_all("a"):
            lnk = link.get("href")
            if lnk and 'threads' in lnk:
                forum_threads.append((base_url + lnk).rsplit("/", 1)[0])

        # De-duplicate before crawling.
        for trd in set(forum_threads):
            pages_of_thread(trd)
            print(f'finished thread: {trd}')

        # Past-the-end subforum pages redirect back to the last real page.
        if not compare_pages2(page_url, url + 'page-' + str(base_page + 1)):
            print(f'finished subforum page #{base_page}')
            base_page += 1
        else:
            lastSubForumPage = True
    else:
        print("Failed to load the page")
        lastSubForumPage = True

# Normalise the collected text to lower case.
corpus['Quote'] = corpus['Quote'].apply(lambda x: x.lower() if isinstance(x, str) else x)
corpus['Response'] = corpus['Response'].apply(lambda x: x.lower() if isinstance(x, str) else x)

# NOTE(review): the original script called
#   corpus.Quote.str.replace('[^a-zA-Z]', '')
#   corpus.Response.str.replace('[^a-zA-Z]', '')
# without assigning the result — Series.str.replace returns a new Series and
# never mutates in place, so those lines were dead code.  Assigning the
# result back would also have deleted all Cyrillic text, so the calls are
# removed rather than "fixed".

# Persist the corpus.
pathToDrive = ''
filename = 'part5.csv'
corpus.to_csv(pathToDrive + filename, index=False)