diff --git a/.github/broken-link-collectorr.py b/.github/broken-link-collectorr.py
new file mode 100644
index 0000000..4ea9a67
--- /dev/null
+++ b/.github/broken-link-collectorr.py
@@ -0,0 +1,116 @@
+"""
+Note:
+    This program filters out most of the good links, collects the links that returned an error, and saves them in a separate file.
+    Some of the links may work in a browser but not in Python because of the webpage's security settings,
+    so a manual check of the filtered records is still needed.
+    Since the program has already filtered out most of the good links, checking the remaining links is quick and saves time.
+    The run takes a while, depending on internet speed.
+
+    Instructions:
+    Download the bad_link_filter and the README as raw files,
+    then execute this script on your machine.
+    The bad links will be saved in the error.txt file.
+    Finally, manually check the links listed in error.txt and remove any good links from that file.
+
+"""
+
+import requests
+
+def is_url_working(url):  # check the status code of the webpage; returns a truthy value only for links that fail
+    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-us,en;q=0.5',
+            'Accept-Encoding': 'gzip,deflate',
+            'Connection': 'keep-alive',
+            'Access-Control-Allow-Methods': 'POST',
+            'Access-Control-Allow-Headers': 'X-PINGOTHER, Content-Type',
+            'Pragma': 'no-cache',
+            'Cache-Control': 'no-cache',
+        }
+        proxies = {"http": None, "https": None}
+        response = requests.get(url, headers=headers, proxies=proxies)
+        status = response.status_code
+        if status >= 400:
+            return status
+    except requests.exceptions.ConnectionError:
+        return 'HTTPSConnectionPool error'
+    except Exception as e:
+        return e
+
+def func(indexes):  # check every link in every section and collect the ones that returned an error
+    error_links = []
+    print('In progress. Sections completed will be shown below. Please wait for a while.')
+    for index, section in indexes.items():
+        for title, row in section.items():
+            error = is_url_working(row['link'])
+            if error:
+                e = {
+                    'index': index,
+                    'title': title,
+                    'link': row['link'],
+                    'error': error,
+                }
+                error_links.append(e)
+        print(index, ' section completed')
+    return error_links
+
+def get_lines_from_file(location):  # open, read, and return lines after filtering out empty lines and spaces
+    lines = []
+    with open(location, 'r') as file:
+        lines = [line.strip() for line in file.readlines() if line.strip()]
+    return lines
+
+def line_to_dict(line):  # convert an API table row to a dict
+    line = line.strip().split('|')
+    name, link = line[1].strip().split('](')
+    name, link = name[1:], link[:-1]
+    row = {
+        'link': link,
+        'description': line[2],
+        'auth': line[3],
+        'https': line[4],
+        'cors': line[5],
+    }
+    return name, row
+
+def section_to_dict(lines, ind):  # convert a section to a dict
+    section = {}
+    while ind