Get web source code

Pepo

Hi

How can I use or develop an extension to get the source code (HTML/plain text) of the website I'm browsing?

Regards

Olaf

You could use the script I described earlier. The appex extension gets the current web page and stores it in a folder within Pythonista for you to use, much like Open in Pythonista.

Pepo

Thanks a lot, Olaf. I'm going to check it.

Pepo

It does not work with frames ... any other way?

cook

@Pepo did you mean it doesn't work with an html <iframe>?

Grabbing the http code of a page won't give you the iframe code because that content is from an external source.

If you're after that, I suggest using Beautiful Soup (bs4) to find the iframe, get the link, then get the code from that link. You may also need to fix the link... for example, "../stuff/morestuff.html" will need to have the rest of the URL: "http://www.google.com/stuff/morestuff.html"

If there is more than one iframe you may need to select from a list which one you want (if it's not easily identifiable with bs4)

I'm also kind of assuming there is a particular webpage that you want to get the iframe code from. I could be wrong...!

cook

Here's a start:

I've been trying to learn more of bs4, so this is a bit of practice for me too...

# coding: utf-8

import requests
from bs4 import BeautifulSoup

def make_soup(url):
	"""Download `url` with requests and return its text parsed as a BeautifulSoup object."""
	response = requests.get(url)
	page_text = response.text
	return BeautifulSoup(page_text)
	
def get_iframe_src_html(soup, base_url=None):
	"""Return the HTML of the page embedded by the first <iframe> in `soup`.

	soup: BeautifulSoup object of the page that contains the iframe.
	base_url: optional URL of that page. When given, a relative iframe src
	such as "../stuff/page.html" is resolved against it before downloading.

	Raises ValueError when the page has no <iframe> or the iframe has no src.
	"""
	#this assumes only one iframe:
	#todo: code for more than one iframe in page use case
	iframe = soup.find('iframe')
	if iframe is None:
		raise ValueError('no <iframe> found in the page')
	src_url = iframe.get('src')
	if not src_url:
		raise ValueError('the <iframe> has no src attribute')

	if base_url:
		# function-scope import so the snippet works on Python 2 and Python 3
		try:
			from urlparse import urljoin
		except ImportError:
			from urllib.parse import urljoin
		# urljoin keeps an absolute src as it is and completes a relative one
		src_url = urljoin(base_url, src_url)
	return requests.get(src_url).text
	

#example:
# a minimal, well-formed page whose only iframe points at an absolute URL
example_page = '<html><body><iframe src="http://www.google.com"></iframe></body></html>'

soup = BeautifulSoup(example_page)

# print() with a single argument behaves the same on Python 2 and Python 3
print(get_iframe_src_html(soup))


#example usage:
#soup = make_soup('http://www.google.com')
#iframe_html = get_iframe_src_html(soup)

cvp

If I understand your request correctly, here is a script to view and, optionally, save the source code of a URL.

# coding: utf-8
import appex
import requests
import os
import ui
import console
import webbrowser
import clipboard
import urllib
import urlparse
import time

def path2url(path):
	"""Return a file: URL for `path`, joined onto the ~/Documents folder."""
	documents_dir = os.path.expanduser("~/Documents")
	quoted_path = urllib.pathname2url(os.path.join(documents_dir, path))
	return urlparse.urljoin('file:', quoted_path)
	
class MyView(ui.View):
	"""Root view of the page-source viewer; main() adds the web view, label and buttons to it."""

	def will_close(self):
		# Dismissing the view without tapping a button must also end the wait:
		# raise the flag that main() polls, otherwise its loop never exits.
		# The saved .txt file is left in place in that case.
		global button_pressed
		button_pressed = True
		
def oui_action(sender):
	"""Button handler for 'yes': keep the saved source by renaming the .txt file to .html."""
	global button_pressed
	source_path, kept_path = file_txt, file_html
	os.rename(source_path, kept_path)
	# tells the waiting loop in main() that an answer was given
	button_pressed = True
		
def non_action(sender):
	"""Button handler for 'Non': discard the saved source file."""
	global button_pressed
	os.unlink(file_txt)
	# tells the waiting loop in main() that an answer was given
	button_pressed = True
		
def _add_choice_button(title, x, y, action):
	# Create one 80x32 bordered answer button, add it to `back` and return it
	button = ui.Button()
	button.border_color = 'black'
	button.border_width = 1
	button.width = 80
	button.height = 32
	button.x = x
	button.y = y
	button.title = title
	button.alignment = ui.ALIGN_CENTER
	button.font = ('Courier',20)
	button.text_color = 'black'
	button.hidden = False
	button.action = action
	back.add_subview(button)
	return button

def main():
	"""Show the source of a web page and ask whether the saved copy should be kept.

	The URL comes from appex.get_url() when running as an extension, otherwise
	from the clipboard. The page is downloaded with requests, saved as
	View-OpenPageSource.txt and displayed in a web view added to the
	module-level view `back`. The function then waits until one of the button
	handlers sets `button_pressed`, and closes `back`.

	Sets the globals button_pressed, file_txt and file_html, which the button
	handlers oui_action and non_action use.
	"""
	global button_pressed,file_txt,file_html

	console.clear()
	
	if appex.is_running_extension():
		url = appex.get_url()
	else:
		url = clipboard.get()
	
	if url is None:
		console.alert('Nothing in the ClipBoard','','Ok',hide_cancel_button=True)
		return
		
	# startswith accepts a tuple, so both schemes are checked in one call
	if not url.startswith(('http://', 'https://')):
		console.alert('ClipBoard does not contain a valid URL','','Ok',hide_cancel_button=True)
		return

	# Webview to display the page source; the bottom strip is left free for
	# the label and the buttons (32 high, 10 margin above and below)
	x = 0
	y = 0
	w = back.width
	h = back.height - 32 - 2*10
	
	web = ui.WebView(name='web',frame=(x,y,w,h))
	web.border_color = 'blue'
	web.border_width = 1
	back.add_subview(web)	
	
	# Label to display the question, centered between the two buttons
	titlbl = ui.Label(name='titlbl')
	titlbl.width = back.width - 80*2 - 10*4
	titlbl.height = 32
	titlbl.x = 80 + 10*2
	titlbl.y = web.y + web.height + 10
	titlbl.text = ''
	titlbl.alignment = ui.ALIGN_CENTER
	titlbl.font= ('Courier-Bold',20)
	titlbl.text_color = 'black'
	back.add_subview(titlbl)
	
	# Button: yes (right side)
	_add_choice_button('yes', web.x + web.width - 80 - 10, titlbl.y, oui_action)
	
	# Button: non (left side)
	_add_choice_button('Non', 10, titlbl.y, non_action)

	# Read page contents
	r = requests.get(url)
	source = r.text
	# Where to save the source
	filename='View-OpenPageSource'
	file_txt = os.path.abspath(filename+'.txt')
	file_html = os.path.abspath(filename+'.html')
	# Save the source; r.text is unicode, so encode it explicitly instead of
	# relying on an implicit ASCII conversion that fails on non-ASCII pages
	with open(file_txt,'wb') as f:
		f.write(source.encode('utf-8'))
	# Display the source (the .txt name makes the web view show it as text)
	web.load_url(path2url(file_txt))

	# Ask if source file to be kept	
	titlbl.text = 'Keep the souce file?'
	# Wait until a button handler sets button_pressed
	button_pressed = False
	while not button_pressed:
		time.sleep(0.5)	
		
	back.close()
		
# Entry point: normally started by the sharing action in Safari, but it can
# also be started by Launcher with the url passed via the clipboard

# Root view that main() fills with the web view, the label and the buttons
back = MyView()
back.name = 'View/Open Page Source'	
back.background_color = 'white'

# Sheet when running as an extension, full screen otherwise
disp_mode = 'sheet' if appex.is_running_extension() else 'full_screen'
back.present(disp_mode, hide_title_bar=False)

# Run main() only when the file is executed as a script, not when it is imported
if __name__ == '__main__':
	main()
	
if not appex.is_running_extension():
	# Back to home screen
	webbrowser.open('launcher://crash')
else:
	appex.finish()

AtomBombed

import urllib2
html_file_url = "" # set your file's url here
download_to = "" # set the name of the file you want to save the html to

# urllib2 has no url_open(); the function that opens a URL is urlopen()
response = urllib2.urlopen(html_file_url)
try:
	# read() returns bytes, so write them unchanged in binary mode;
	# the with block closes the file even if the write fails
	with open(download_to, "wb") as out_file:
		out_file.write(response.read())
finally:
	response.close()

ccc

@cvp An alternative could be:

if url[:7] <> 'http://' and url[:8] <> 'https://':
# -->
if not url.startswith(('http://', 'https://')):

<> is deprecated in Python2 and removed in Python3. Use != instead.

@AtomBombed It is not recommended to open files without closing them. http://stackoverflow.com/questions/7395542/is-explicitly-closing-files-important

# urlopen() is the urllib2 function that opens a URL; its read() returns bytes
with open(download_to, "wb") as out_file:  # will automatically close()
    out_file.write(urllib2.urlopen(html_file_url).read())

cvp

@ccc thanks, I always forget this !=...

AtomBombed

@ccc forgot to put that in at the end. I normally do with my own code.

uj_jonas

So I know the question has been answered, but I just wanted to share this. It's just another approach.

import ui, appex
class wvdelegate(object):
	"""WebView delegate that replaces the loaded page with its own HTML source."""
	def webview_did_finish_load(self, webview):
		# Read the markup of the loaded document from the page itself
		html = webview.eval_js('document.documentElement.innerHTML')
		# <xmp> shows its content as literal text; it needs a real closing tag
		webview.load_html('<xmp>' + html + '</xmp>')
		# Detach the delegate so this handler is not run again for the page loaded above
		webview.delegate = None
# Create the web view and start loading the URL returned by appex.get_url()
wv = ui.WebView()
wv.load_url(appex.get_url())
# Attach the delegate whose webview_did_finish_load handler runs when loading finishes
wv.delegate = wvdelegate()
# Show the web view
wv.present()

You could also do it like this to copy the HTML:

import ui, appex, clipboard
class wvdelegate(object):
	"""WebView delegate that shows the page's HTML source and adds a copy-to-clipboard button."""

	# Page source; stays None until the first load has finished
	html = None

	def webview_did_finish_load(self, webview):
		if self.html is not None:
			# The source is already shown; presumably this call was caused by
			# the load_html() below, so there is nothing left to do
			return
		self.html = webview.eval_js('document.documentElement.innerHTML')
		# <xmp> shows its content as literal text; it needs a real closing tag
		webview.load_html('<xmp>' + self.html + '</xmp>')
		# Use the webview passed to the handler instead of a module-level name
		webview.right_button_items = [ui.ButtonItem(image=ui.Image('iob:clipboard_32'), action=lambda sender: clipboard.set(self.html))]
# Create the web view and start loading the URL returned by appex.get_url()
wv = ui.WebView()
wv.load_url(appex.get_url())
# Attach the delegate; its handler reads the page source and adds the clipboard button
wv.delegate = wvdelegate()
# Show the web view
wv.present()