94d65bc059
* Add broken-links to file This file contains list of broken links removed from readme.md Some links may be temporarily down or moved So that we can investigate on these links and upload working links in future * Automated broken-links collector To implement this file you have to download readme.md(raw file) and broken_link_finder.py and execute them in your machine This program search the entire readme.txt(raw file) and collects the broken-links in error.txt file Some webpage return error due to bot security feature of webpage but works fine when we manually tried in browser Hence a manual check is needed only on those filtered link which is saved in error.txt is needed Thus this program help to narrow down our search to find broken-links * Remove broken-links The are total of 70 links that doesn't work maybe temporarily or permanently Maybe some webpage moved to new pages So I removed them from readme.md and saved them in broken-links.md for future
117 lines
4.5 KiB
Python
117 lines
4.5 KiB
Python
"""
|
|
Note:
|
|
This Program fillters out most of the good links and collect links that returned error save them in separate file
|
|
Some of the links may work in browser but not in python due to security of webpage
|
|
So a manual check on those filtered record is needed
|
|
Since the program already filtered most of the good links we can easily check the reaming link and save time
|
|
This program takes a while depending on internet speed
|
|
|
|
Instruction:
|
|
Download the bad_link_filter and readme as raw file
|
|
Then execute in your machine
|
|
The bad links will be saved in error.txt file
|
|
Then you have to manually check the links mentioned in error.txt file and remove the good links from the file
|
|
|
|
"""
|
|
|
|
def is_url_working(url): #Check the status code of webpage
|
|
import requests
|
|
try:
|
|
headers = {
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
|
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
'Accept-Language': 'en-us,en;q=0.5',
|
|
'Accept-Encoding': 'gzip,deflate',
|
|
'Connection': 'keep-alive',
|
|
'Access-Control-Allow-Methods': 'POST',
|
|
'Access-Control-Allow-Headers': 'X-PINGOTHER, Content-Type',
|
|
'Pragma': 'no-cache',
|
|
'Cache-Control': 'no-cache',
|
|
}
|
|
proxies = {"http": None,"https": None}
|
|
response=requests.get(url,headers=headers,proxies=proxies)
|
|
status=response.status_code
|
|
if status>=400:
|
|
return status
|
|
except requests.exceptions.ConnectionError as ce:
|
|
return 'HTTPSConnectionPool error'
|
|
except Exception as e:
|
|
return e
|
|
|
|
def func(indexes):
|
|
error_links=[]
|
|
print('InProgress, Sections completed will be shown below.Please wait for a while')
|
|
for index,section in indexes.items():
|
|
for title,row in section.items():
|
|
error=is_url_working(row['link'])
|
|
if error:
|
|
e={
|
|
'index':index,
|
|
'title':title,
|
|
'link': row['link'],
|
|
'error':error
|
|
}
|
|
error_links.append(e)
|
|
print(index,' section completed')
|
|
return error_links
|
|
|
|
def get_lines_from_file(location): #open,read,return lines after filtering empty lines and spaces
|
|
lines=[]
|
|
with open(location,'r') as file:
|
|
lines=[line.strip() for line in file.readlines() if line.strip()]
|
|
return lines
|
|
|
|
def line_to_dict(line): #covert api row to dict
|
|
line=line.strip().split('|')
|
|
name,link=line[1].strip().split('](')
|
|
name,link=name[1:],link[:-1]
|
|
row={
|
|
'link':link,
|
|
'description':line[2],
|
|
'auth':line[3],
|
|
'https':line[4],
|
|
'cors':line[5],
|
|
}
|
|
return name,row
|
|
|
|
def section_to_dict(lines,ind): #convert section to dict
|
|
section={}
|
|
while ind<len(lines):
|
|
if 'Back to Index' in lines[ind]: #Break a section
|
|
break
|
|
name,row=line_to_dict(lines[ind])
|
|
section[name]=row
|
|
ind+=1
|
|
return ind,section
|
|
|
|
def get_section_wise_dict(lines): #convert unstructured lines to section wise dict
|
|
ind=0
|
|
indexes={}
|
|
while ind<len(lines):
|
|
if '###' in lines[ind]: #Enters a section
|
|
name=lines[ind][3:].strip()
|
|
ind,indexes[name]=section_to_dict(lines,ind+1+1+1)
|
|
ind+=1
|
|
return indexes
|
|
|
|
def link_to_error_file(error_links): #Enters the bad links to a file which further requires manual check
|
|
lines=[]
|
|
for row in error_links:
|
|
statement='| {} | [{}]({}) | {} |'.format(row['index'], row['title'], row['link'], str(row['error']))
|
|
lines.append(statement)
|
|
with open('error.txt','w') as file:
|
|
file.write('\n#Manual check has to be done on following links#\n\n')
|
|
file.write('| Section | API |Error/Satus Code |\n')
|
|
file.write('|---|---|---|\n')
|
|
for line in lines:
|
|
file.write(line)
|
|
file.write('\n')
|
|
print("Written to file")
|
|
print('Manual check has to be done for the links saved in error.txt')
|
|
|
|
location= input('Location of readme public api readme file: ') #Get location of raw readme file
|
|
lines=get_lines_from_file(location)
|
|
indexes=get_section_wise_dict(lines)
|
|
error_links=func(indexes)
|
|
link_to_error_file(error_links)
|