Welcome!
This is the community forum for my apps Pythonista and Editorial.
For individual support questions, you can also send an email. If you have a very short question or just want to say hello — I'm @olemoritz on Twitter.
Get web source code
-
Hi
How could I use/develop an extension to get the source code (HTML/plain text) of the website I'm currently browsing?
Regards
-
You could use the script I described earlier. The appex extension gets the current web page and stores it in a folder within Pythonista for you to use, much like
Open in Pythonista
. -
Thanks a lot, Olaf. I'm going to check it.
-
It does not work with frames ... any other way?
-
@Pepo did you mean it doesn't work with an html <iframe>?
Grabbing the http code of a page won't give you the iframe code because that content is from an external source.
If you're after that, I suggest using Beautiful Soup (bs4) to find the iframe, get the link, then get the code from that link. You may also need to fix the link: for example, a relative link such as "../stuff/morestuff.html" will need the rest of the URL added to become "http://www.google.com/stuff/morestuff.html".
If there is more than one iframe you may need to select from a list which one you want (if it's not easily identifiable with bs4)
I'm also kind of assuming there is a particular webpage that you want to get the iframe code from. I could be wrong...!
-
Here's a start:
I've been trying to learn more of bs4, so this is a bit of practice for me too...
# coding: utf-8
"""Fetch the HTML document that an <iframe> on a web page points to."""
import requests
from bs4 import BeautifulSoup

try:  # Python 3
    from urllib.parse import urljoin
except ImportError:  # Python 2
    from urlparse import urljoin


def make_soup(url):
    """Download *url* and return its markup parsed as a BeautifulSoup object."""
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed and emits a warning.
    return BeautifulSoup(requests.get(url).text, 'html.parser')


def get_iframe_src_html(soup, base_url=None):
    """Return the HTML of the document loaded by the first <iframe> in *soup*.

    soup     -- BeautifulSoup object of the outer page.
    base_url -- optional URL of the outer page. When given, the iframe's
                ``src`` is resolved against it, so relative links such as
                "../stuff/page.html" work. When omitted, ``src`` is passed
                to requests unchanged (a relative ``src`` then makes
                requests raise, as before).

    Raises ValueError when *soup* contains no <iframe> with a ``src``.
    """
    # TODO: code for the "more than one iframe in the page" use case;
    # only the first iframe that has a src attribute is used.
    iframe = soup.find('iframe', src=True)
    if iframe is None:
        raise ValueError('no <iframe> with a src attribute found')
    src_url = iframe['src']
    if base_url:
        # urljoin leaves absolute URLs untouched and completes relative ones.
        src_url = urljoin(base_url, src_url)
    return requests.get(src_url).text


# example:
example_page = '<html><body><iframe src="http://www.google.com"></iframe></body></html>'
soup = BeautifulSoup(example_page, 'html.parser')
print(get_iframe_src_html(soup))

# example usage:
# page_url = 'http://www.google.com'
# soup = make_soup(page_url)
# iframe_html = get_iframe_src_html(soup, base_url=page_url)
-
If I understand your request correctly, here is a script to view and optionally save the source code of a URL.
# coding: utf-8
"""View the source of a web page and optionally keep it as a file.

The URL comes from the share sheet when run as an app extension,
otherwise from the clipboard.
"""
import appex
import requests
import io
import os
import ui
import console
import webbrowser
import clipboard
import urllib
import urlparse
import time


def path2url(path):
    """Return a file: URL for *path*, joined onto ~/Documents."""
    w = os.path.join(os.path.expanduser("~/Documents"), path)
    return urlparse.urljoin('file:', urllib.pathname2url(w))


class MyView(ui.View):
    """Container view holding the web view, the label and the two buttons."""

    def will_close(self):
        # NOTE(review): nothing sets button_pressed here, so if the view is
        # dismissed without tapping a button the wait loop in main() keeps
        # sleeping -- confirm whether that needs handling.
        pass


def oui_action(sender):
    """'yes' button: keep the source by renaming the .txt file to .html."""
    global button_pressed, file_txt, file_html
    os.rename(file_txt, file_html)
    button_pressed = True


def non_action(sender):
    """'Non' button: discard the saved source file."""
    global button_pressed, file_txt
    os.remove(file_txt)
    button_pressed = True


def main():
    """Fetch the page, show its source and wait for the keep/discard choice."""
    global button_pressed, file_txt, file_html

    console.clear()
    if appex.is_running_extension():
        url = appex.get_url()
    else:
        url = clipboard.get()

    if url is None:
        console.alert('Nothing in the ClipBoard', '', 'Ok', hide_cancel_button=True)
        return
    if not url.startswith(('http://', 'https://')):
        console.alert('ClipBoard does not contain a valid URL', '', 'Ok', hide_cancel_button=True)
        return

    # Web view that displays the saved page source; it fills the container
    # except for a strip at the bottom reserved for the label and buttons.
    x = 0
    y = 0
    w = back.width
    h = back.height - 32 - 2*10
    web = ui.WebView(name='web', frame=(x, y, w, h))
    web.border_color = 'blue'
    web.border_width = 1
    back.add_subview(web)

    # Label to display progress
    titlbl = ui.Label(name='titlbl')
    titlbl.width = back.width - 80*2 - 10*4
    titlbl.height = 32
    titlbl.x = 80 + 10*2
    titlbl.y = web.y + web.height + 10
    titlbl.text = ''
    titlbl.alignment = ui.ALIGN_CENTER
    titlbl.font = ('Courier-Bold', 20)
    titlbl.text_color = 'black'
    back.add_subview(titlbl)

    # Button: yes
    oui_button = ui.Button()
    oui_button.border_color = 'black'
    oui_button.border_width = 1
    oui_button.width = 80
    oui_button.height = 32
    oui_button.x = web.x + web.width - 80 - 10
    oui_button.y = titlbl.y
    oui_button.title = 'yes'
    oui_button.alignment = ui.ALIGN_CENTER
    oui_button.font = ('Courier', 20)
    oui_button.text_color = 'black'
    oui_button.hidden = False
    oui_button.action = oui_action
    back.add_subview(oui_button)

    # Button: non
    non_button = ui.Button()
    non_button.border_color = 'black'
    non_button.border_width = 1
    non_button.width = 80
    non_button.height = 32
    non_button.x = 10
    non_button.y = titlbl.y
    non_button.title = 'Non'
    non_button.alignment = ui.ALIGN_CENTER
    non_button.font = ('Courier', 20)
    non_button.text_color = 'black'
    non_button.hidden = False
    non_button.action = non_action
    back.add_subview(non_button)

    # Read page contents
    r = requests.get(url)
    source = r.text

    # Where to save the source
    filename = 'View-OpenPageSource'
    file_txt = os.path.abspath(filename + '.txt')
    file_html = os.path.abspath(filename + '.html')

    # Save the source. r.text is unicode, so write it through io.open with
    # an explicit encoding; the builtin open() of Python 2 would try to
    # encode it as ASCII and fail on pages with non-ASCII characters.
    with io.open(file_txt, 'w', encoding='utf-8') as f:
        f.write(source)

    # Display the source (loaded as .txt so the markup is shown, not rendered)
    web.load_url(path2url(file_txt))

    # Ask if source file to be kept
    titlbl.text = 'Keep the souce file?'

    # Wait until one of the button actions sets the flag
    button_pressed = False
    while not button_pressed:
        time.sleep(0.5)

    back.close()


# Normally called by sharing action in Safari, but could be called by
# Launcher and passing url via clipboard.
# The container view is created and presented here, before main() runs;
# main() adds its subviews to it.
back = MyView()
back.background_color = 'white'
back.name = 'View/Open Page Source'
if appex.is_running_extension():
    disp_mode = 'sheet'
else:
    disp_mode = 'full_screen'
back.present(disp_mode, hide_title_bar=False)

# check if the script is running instead of being imported
if __name__ == '__main__':
    main()
    if appex.is_running_extension():
        appex.finish()
    else:
        # Back to home screen
        webbrowser.open('launcher://crash')
-
import urllib2

html_file_url = ""  # set your file's url here
download_to = ""  # set the name of the file you want to save the html to

# urllib2 spells the function urlopen(); the with-block closes the output
# file even if the download or the write fails.
with open(download_to, "w") as out_file:
    out_file.write(urllib2.urlopen(html_file_url).read())
-
@cvp An alternative could be:
if url[:7] <> 'http://' and url[:8] <> 'https://': # --> if not url.startswith(('http://', 'https://')):
<>
is deprecated in Python 2 and removed in Python 3. Use !=
instead. @AtomBombed It is not recommended to open files without closing them. http://stackoverflow.com/questions/7395542/is-explicitly-closing-files-important
with open(download_to, "w") as out_file:  # will automatically close()
    # urllib2 spells the function urlopen(), without an underscore
    out_file.write(urllib2.urlopen(html_file_url).read())
-
@ccc thanks, I always forget this !=...
-
@ccc forgot to put that in at the end. I normally do with my own code.
-
So I know the question has been answered, but I just wanted to share this. It's just another approach.
import ui
import appex


class wvdelegate(object):
    """WebView delegate that replaces a loaded page with its own HTML source."""

    def webview_did_finish_load(self, webview):
        """Show the markup of the page that just finished loading as text."""
        html = webview.eval_js('document.documentElement.innerHTML')
        # Wrap the markup in <xmp> so it is displayed instead of rendered.
        # The end tag needs a forward slash; a backslash does not close
        # the element.
        webview.load_html('<xmp>' + html + '</xmp>')
        # Drop the delegate, presumably so that this handler is not invoked
        # again for the document loaded just above.
        webview.delegate = None


wv = ui.WebView()
wv.load_url(appex.get_url())
wv.delegate = wvdelegate()
wv.present()
You could also do it like this to copy the HTML:
import ui
import appex
import clipboard


class wvdelegate(object):
    """WebView delegate that shows a page's HTML source and offers to copy it."""

    def webview_did_finish_load(self, webview):
        """Show the loaded page's markup as text and add a copy button."""
        # Kept on the delegate so the copy button can reach it later.
        self.html = webview.eval_js('document.documentElement.innerHTML')
        # Wrap the markup in <xmp> so it is displayed instead of rendered.
        # The end tag needs a forward slash; a backslash does not close
        # the element.
        webview.load_html('<xmp>' + self.html + '</xmp>')
        # Shadow the handler on this instance, presumably so it does not run
        # again for the document loaded just above, while the delegate
        # (and its html attribute) stays attached.
        webview.delegate.webview_did_finish_load = None
        # Title-bar button that copies the captured markup to the clipboard.
        webview.right_button_items = [
            ui.ButtonItem(
                image=ui.Image('iob:clipboard_32'),
                action=lambda sender: clipboard.set(self.html),
            )
        ]


wv = ui.WebView()
wv.load_url(appex.get_url())
wv.delegate = wvdelegate()
wv.present()