From 94d65bc059186044a9ba000437fb5828b8a737c1 Mon Sep 17 00:00:00 2001
From: iamjoker021 <84496123+iamjoker021@users.noreply.github.com>
Date: Thu, 19 Aug 2021 08:39:02 +0530
Subject: [PATCH] Remove broken links (#16)

* Add broken-links.md file

This file contains the list of broken links removed from readme.md.
Some links may be temporarily down or may have moved, so keeping them
here lets us investigate them and restore working links in the future.

* Automated broken-links collector

To use this script, download readme.md (raw file) and
broken-link-collector.py, then execute the script on your machine.
The program scans the entire readme.md (raw file) and collects the
broken links in an error.txt file.
Some webpages return an error because of bot-protection features but
work fine when opened manually in a browser, so a manual check is
needed only on the filtered links saved in error.txt.
This narrows the search for broken links considerably.

* Remove broken-links

There are a total of 70 links that don't work, whether temporarily or
permanently; some pages may have moved to new URLs. I removed them from
readme.md and saved them in broken-links.md for future reference.
---
 .github/broken-link-collector.py | 116 ++++++++++++++++++++++++++++++
 README.md                        |  70 ------------------
 broken-links.md                  |  78 ++++++++++++++++++++
 3 files changed, 194 insertions(+), 70 deletions(-)
 create mode 100644 .github/broken-link-collector.py
 create mode 100644 broken-links.md

diff --git a/.github/broken-link-collector.py b/.github/broken-link-collector.py
new file mode 100644
index 0000000..4ea9a67
--- /dev/null
+++ b/.github/broken-link-collector.py
@@ -0,0 +1,116 @@
+"""
+Note:
+    This program filters out most of the good links and collects the links
+    that returned an error, saving them in a separate file.
+    Some links may work in a browser but not in Python because of webpage
+    security, so a manual check on the filtered records is needed.
+    Since the program has already filtered out most of the good links, the
+    remaining links can be checked quickly, which saves time.
+    This program takes a while, depending on internet speed.
+
+    Instructions:
+        Download this script and the readme as raw files, then execute the
+        script on your machine.
+        The bad links will be saved in the error.txt file.
+        Then manually check the links listed in error.txt and remove the
+        good links from that file.
+"""
+import requests
+
+
+def is_url_working(url):
+    """Return a truthy error value (status code or exception) if the URL
+    looks broken, else None."""
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        'Accept-Language': 'en-us,en;q=0.5',
+        'Accept-Encoding': 'gzip,deflate',
+        'Connection': 'keep-alive',
+        'Pragma': 'no-cache',
+        'Cache-Control': 'no-cache',
+    }
+    proxies = {"http": None, "https": None}
+    try:
+        # A timeout keeps a single dead host from hanging the whole run.
+        response = requests.get(url, headers=headers, proxies=proxies, timeout=30)
+        if response.status_code >= 400:
+            return response.status_code
+    except requests.exceptions.ConnectionError:
+        return 'HTTPSConnectionPool error'
+    except Exception as e:
+        return e
+    return None
+
+
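+# A minimal sketch of what is_url_working() returns, assuming a reachable
+# test host (the URLs below are illustrative placeholders, not links from
+# the readme):
+#
+#     >>> is_url_working('https://example.com/')         # 200 OK -> None (falsy)
+#     >>> is_url_working('https://example.com/missing')  # broken -> 404
+#     404
+#
+# Any truthy return value marks the link as broken.
+
+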
+def func(indexes):
+    """Walk every section of the parsed readme and collect the links that
+    return an error."""
+    error_links = []
+    print('In progress; completed sections will be shown below. Please wait a while.')
+    for index, section in indexes.items():
+        for title, row in section.items():
+            error = is_url_working(row['link'])
+            if error:
+                error_links.append({
+                    'index': index,
+                    'title': title,
+                    'link': row['link'],
+                    'error': error,
+                })
+        print(index, 'section completed')
+    return error_links
+
+
+def get_lines_from_file(location):
+    """Open a file and return its lines, dropping blank lines and
+    surrounding whitespace."""
+    with open(location, 'r') as file:
+        return [line.strip() for line in file if line.strip()]
+
+
+def line_to_dict(line):
+    """Convert one markdown table row describing an API into a dict."""
+    line = line.strip().split('|')
+    # The name cell looks like '[Name](https://link)'; split out name and link.
+    name, link = line[1].strip().split('](')
+    name, link = name[1:], link[:-1]
+    row = {
+        'link': link,
+        'description': line[2],
+        'auth': line[3],
+        'https': line[4],
+        'cors': line[5],
+    }
+    return name, row
+
+
+def section_to_dict(lines, ind):
+    """Convert one readme section into a dict of rows."""
+    section = {}
+    while ind