Package lxml :: Package html
[frames | no frames]

Source Code for Package lxml.html

   1  """The ``lxml.html`` tool set for HTML handling. 
   2  """ 
   3   
   4  import threading 
   5  import re 
   6  import urlparse 
   7  import copy 
   8  from lxml import etree 
   9  from lxml.html import defs 
  10  from lxml import cssselect 
  11  from lxml.html._setmixin import SetMixin 
  12  try: 
  13      from UserDict import DictMixin 
  14  except ImportError: 
  15      # DictMixin was introduced in Python 2.4 
  16      from lxml.html._dictmixin import DictMixin 
  17  import sets 
  18   
  19  __all__ = [ 
  20      'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 
  21      'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 
  22      'find_rel_links', 'find_class', 'make_links_absolute', 
  23      'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser'] 
  24   
  25  _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]") 
  26  #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) 
  27  _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") 
  28  _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") 
  29  _collect_string_content = etree.XPath("string()") 
  30  _css_url_re = re.compile(r'url\((.*?)\)', re.I) 
  31  _css_import_re = re.compile(r'@import "(.*?)"') 
  32  _label_xpath = etree.XPath("//label[@for=$id]") 
  33  _archive_re = re.compile(r'[^ ]+') 
  34   
35 -class HtmlMixin(object):
36
def base_url(self):
    """
    The URL recorded for this document when it was parsed (the
    ``docinfo.URL`` of the element's root tree).

    Combine it with ``urlparse.urljoin(el.base_url, href)`` to turn a
    relative link into an absolute URL.
    """
    tree = self.getroottree()
    return tree.docinfo.URL
base_url = property(base_url, doc=base_url.__doc__)
def forms(self):
    """
    A list of every ``form`` element produced by
    ``self.getiterator('form')``.
    """
    return [form for form in self.getiterator('form')]
forms = property(forms, doc=forms.__doc__)
def body(self):
    """
    Return the <body> element.  Can be called from a child element
    to get the document's body.

    Raises IndexError when the ``//body`` query matches nothing.
    """
    return self.xpath('//body')[0]
body = property(body, doc=body.__doc__)
def head(self):
    """
    The document's <head> element; may be read from any element of
    the document.  Raises IndexError when ``//head`` matches nothing.
    """
    matches = self.xpath('//head')
    return matches[0]
head = property(head, doc=head.__doc__)
70 - def _label__get(self):
71 """ 72 Get or set any <label> element associated with this element. 73 """ 74 id = self.get('id') 75 if not id: 76 return None 77 result = _label_xpath(self, id=id) 78 if not result: 79 return None 80 else: 81 return result[0]
82 - def _label__set(self, label):
83 id = self.get('id') 84 if not id: 85 raise TypeError( 86 "You cannot set a label for an element (%r) that has no id" 87 % self) 88 if not label.tag == 'label': 89 raise TypeError( 90 "You can only assign label to a label element (not %r)" 91 % label) 92 label.set('for', id)
93 - def _label__del(self):
94 label = self.label 95 if label is not None: 96 del label.attrib['for']
97 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__) 98
def drop_tree(self):
    """
    Detach this element, with its children and text, from its parent.

    The element's tail text is kept: it is appended to the previous
    sibling's tail, or to the parent's text when there is no previous
    sibling.  The element must have a parent.
    """
    parent = self.getparent()
    assert parent is not None
    trailing = self.tail
    if trailing:
        sibling = self.getprevious()
        if sibling is not None:
            sibling.tail = (sibling.tail or '') + trailing
        else:
            parent.text = (parent.text or '') + trailing
    parent.remove(self)
114
def drop_tag(self):
    """
    Remove this tag while keeping its children and text, which are
    spliced into the parent at the element's position.

    Example::

        >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
        >>> h.find('.//b').drop_tag()
        >>> print tostring(h)
        <div>Hello World!</div>
    """
    parent = self.getparent()
    assert parent is not None
    previous = self.getprevious()
    own_text = self.text
    # Only nodes with a string tag contribute their text
    # (not a Comment, etc.).
    if own_text and isinstance(self.tag, basestring):
        if previous is not None:
            previous.tail = (previous.tail or '') + own_text
        else:
            parent.text = (parent.text or '') + own_text
    own_tail = self.tail
    if own_tail:
        if len(self):
            # The tail follows the last child once the children move up.
            last_child = self[-1]
            last_child.tail = (last_child.tail or '') + own_tail
        elif previous is not None:
            previous.tail = (previous.tail or '') + own_tail
        else:
            parent.text = (parent.text or '') + own_tail
    position = parent.index(self)
    parent[position:position+1] = self[:]
146 154
def find_class(self, class_name):
    """
    Return the elements, at or below this one, whose ``class``
    attribute contains ``class_name`` as one of its space-separated
    names.
    """
    matches = _class_xpath(self, class_name=class_name)
    return matches
160
def get_element_by_id(self, id, *default):
    """
    Get the first element in a document with the given id.  If none is
    found, return the default argument if provided or raise KeyError
    otherwise.

    Note that there can be more than one element with the same id,
    and this isn't uncommon in HTML documents found in the wild.
    Browsers return only the first match, and this function does
    the same.
    """
    try:
        # FIXME: should this check for multiple matches?
        # browsers just return the first one
        return _id_xpath(self, id=id)[0]
    except IndexError:
        if default:
            return default[0]
        else:
            # Call form instead of the ``raise KeyError, id`` statement,
            # which is a syntax error outside Python 2.
            raise KeyError(id)
181
def text_content(self):
    """
    Return the XPath string value of this element: its own text
    together with the text of its descendants.
    """
    content = _collect_string_content(self)
    return content
187
def cssselect(self, expr):
    """
    Run the CSS expression on this element and its children,
    returning a list of the results.

    Equivalent to lxml.cssselect.CSSSelector(expr)(self) -- note
    that pre-compiling the expression can provide a substantial
    speedup.
    """
    return cssselect.CSSSelector(expr)(self)
198 199 ######################################## 200 ## Link functions 201 ######################################## 202 222 self.rewrite_links(link_repl)
223
def resolve_base_href(self):
    """
    Apply the document's ``<base href>`` to its links and remove the tag.

    Every ``<base>`` element with an ``href`` is dropped from the tree;
    the href of the last one found is passed to
    ``make_links_absolute()``.  Nothing else happens when no non-empty
    href is found.
    """
    href = None
    for base_tag in self.xpath('//base[@href]'):
        href = base_tag.get('href')
        base_tag.drop_tree()
    if href:
        self.make_links_absolute(href, resolve_base_href=False)
238 293 340 341
class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        """
        ``name`` is the method to look up on the element; ``copy`` is
        the default for whether element inputs are deep-copied before
        the call; the method's docstring is taken from ``source_class``.
        """
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        """
        Call the wrapped method on ``doc``.

        A string ``doc`` is parsed with ``fromstring`` (the ``copy``
        keyword is then rejected with TypeError).  An element ``doc`` is
        deep-copied first when ``copy`` (keyword, or the instance
        default) is true.
        """
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            return_string = True
            doc = fromstring(doc, **kw)
        else:
            # The flag must not be called ``copy``: a local of that name
            # hides the ``copy`` module and breaks ``copy.deepcopy``.
            make_a_copy = kw.pop('copy', self.copy)
            return_string = False
            if make_a_copy:
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is None:
            # Then return what we got in
            if return_string:
                return tostring(doc)
            else:
                return doc
        else:
            return result

# Function forms of the HtmlMixin link/class helpers.  The ones created
# with copy=True work on a deep copy of an element input by default.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class combining ``etree.CommentBase`` with ``HtmlMixin``."""
390
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class combining ``etree.ElementBase`` with ``HtmlMixin``."""
393
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction class combining ``etree.PIBase`` with ``HtmlMixin``."""
396
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class combining ``etree.EntityBase`` with ``HtmlMixin``."""
399 400
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may also be given as an iterable of ``(tag name, mixin)``
    pairs.  Note that a ``classes`` dict supplied by the caller is
    updated in place when mixins are applied.
    """
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly yields only its keys, so turn
            # a mapping into (name, mixin) pairs first.
            if hasattr(mixins, 'items'):
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    # Applies to every tag that has a class registered.
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first in the bases so they take precedence.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """
        Return the class for a node: the registered class for the
        lower-cased tag name (``HtmlElement`` by default) for elements,
        the Html* class for comments, PIs and entities, and None for
        any other node type.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
441 442 ################################################################################ 443 # parsing 444 ################################################################################ 445
def document_fromstring(html, parser=None, **kw):
    """
    Parse ``html`` with ``etree.fromstring`` and return the result.

    The module-level ``html_parser`` is used when ``parser`` is None;
    extra keywords are passed on to ``etree.fromstring``.  Raises
    ``etree.ParserError`` when the parse result is None.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")
454
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string: text that precedes the
    first element is kept unless it is whitespace only.  If
    no_leading_text is true, such text is an error
    (``etree.ParserError``) and the list contains only elements.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    prefix = html[:20].lstrip().lower()
    looks_like_document = (prefix.startswith('<html')
                           or prefix.startswith('<!doctype'))
    if not looks_like_document:
        # Wrap the fragment so that it parses into a single <body>.
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert doc.tag == 'html'
    bodies = [child for child in doc if child.tag == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    elements = []
    leading = body.text
    if leading and leading.strip():
        if no_leading_text:
            raise etree.ParserError(
                "There is leading text: %r" % body.text)
        elements.append(leading)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
488
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; ``etree.ParserError`` is raised if
    there is no element, more than one element, or anything but
    whitespace before or after the element.

    If create_parent is true (or is a tag name) the HTML is first
    wrapped in a parent element -- ``div`` unless a tag name was given --
    and that parent is returned.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    if create_parent:
        if not isinstance(create_parent, basestring):
            create_parent = 'div'
        wrapped = '<%s>%s</%s>' % (create_parent, html, create_parent)
        return fragment_fromstring(wrapped, parser=parser,
                                   base_url=base_url, **kw)
    found = fragments_fromstring(html, parser=parser, no_leading_text=True,
                                 base_url=base_url, **kw)
    if not found:
        raise etree.ParserError(
            "No elements found")
    if len(found) > 1:
        names = [_element_name(item) for item in found]
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join(names))
    el = found[0]
    if el.tail and el.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    # Only whitespace can be left in the tail at this point; discard it.
    el.tail = None
    return el
524
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    The whole document is returned when the input starts like a full
    document, when the parsed tree has a <head>, or when it has no
    <body> at all.  A body with exactly one element and no surrounding
    text yields that element; otherwise the body itself is returned,
    renamed to ``div`` or ``span``.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    start = html[:10].lstrip().lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        # Looks like a full HTML document
        return document_fromstring(html, parser=parser, base_url=base_url, **kw)
    # otherwise, lets parse it out...
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    bodies = doc.findall('body')
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        # Neither <head> nor <body>: there is nothing to unwrap, and the
        # checks below would fail on None, so hand back the document.
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
583
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: the result of ``etree.parse`` is a tree, not an
    element; use ``parse(...).getroot()`` to get the document root.

    The ``base_url`` keyword overrides the base URL, which is most
    useful when parsing from a file-like object.  The module-level
    ``html_parser`` is used when no parser is given.
    """
    chosen_parser = parser
    if chosen_parser is None:
        chosen_parser = html_parser
    return etree.parse(filename_or_url, chosen_parser, base_url=base_url, **kw)
596
def _contains_block_level_tag(el):
    """
    True if any node yielded by ``el.getiterator()`` has a tag listed
    in ``defs.block_tags``.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    found = False
    for node in el.getiterator():
        if node.tag in defs.block_tags:
            found = True
            break
    return found
604
def _element_name(el):
    """
    Short name for an item in error messages: 'comment' for comment
    nodes, 'string' for strings, otherwise the element's tag.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return el.tag
612 613 ################################################################################ 614 # form handling 615 ################################################################################ 616
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def _fields__get(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to affect the
        form.

        Assigning a mapping sets every field it names and sets every
        other named field to None.
        """
        return FieldsDict(self.inputs)
    def _fields__set(self, value):
        prev_keys = list(self.fields.keys())
        # Distinct loop names: the old loop rebound ``value`` while
        # iterating over it.
        for key, field_value in value.items():
            if key in prev_keys:
                prev_keys.remove(key)
            self.fields[key] = field_value
        for key in prev_keys:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            self.fields[key] = None

    fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__)

    def _name(self):
        """
        Label used in reprs: the ``name`` attribute, else ``#`` plus the
        ``id`` attribute, else the form's position among the forms
        found under the document body.
        """
        if self.get('name'):
            return self.get('name')
        elif self.get('id'):
            return '#' + self.get('id')
        # Walk the whole body: ``findall('form')`` only sees direct
        # children of <body> and fails for a form nested any deeper.
        return str(list(self.body.getiterator('form')).index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.

        Unnamed fields, unchecked checkable inputs, inputs of type
        submit/image/reset and fields whose value is None are left out;
        a multiple select contributes one tuple per value.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            if el.tag == 'textarea':
                results.append((name, el.value))
            elif el.tag == 'select':
                value = el.value
                if el.multiple:
                    for v in value:
                        results.append((name, v))
                elif value is not None:
                    results.append((name, el.value))
            else:
                assert el.tag == 'input', (
                    "Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset'):
                    continue
                value = el.value
                if value is not None:
                    results.append((name, el.value))
        return results

    def _action__get(self):
        """
        Get/set the form's ``action`` attribute.

        When the document has a base URL and the attribute is present,
        the value read is the attribute joined onto the base URL.
        """
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            return urlparse.urljoin(base_url, action)
        else:
            return action
    def _action__set(self, value):
        self.set('action', value)
    def _action__del(self):
        if 'action' in self.attrib:
            del self.attrib['action']
    action = property(_action__get, _action__set, _action__del, doc=_action__get.__doc__)

    def _method__get(self):
        """
        Get/set the form's method.  Always returns an upper-case
        string, and defaults to ``'GET'``
        """
        return self.get('method', 'GET').upper()
    def _method__set(self, value):
        self.set('method', value.upper())
    method = property(_method__get, _method__set, doc=_method__get.__doc__)

# Elements with the tag ``form`` are created as FormElement by default.
HtmlElementClassLookup._default_element_classes['form'] = FormElement
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` (a mapping or a sequence of ``(name, value)``
    tuples) is appended to the form's own values.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method (``'GET'`` unless the form says
    otherwise), the URL is the target URL as a string, and the values are
    a sequence of ``(name, value)`` tuples with the form data.  A form
    without an action is submitted to its document's base URL.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extra_values = extra_values.items()
        values.extend(extra_values)
    if open_http is None:
        open_http = open_http_urllib
    url = form.action
    if not url:
        # No usable action: fall back to the document's own URL rather
        # than handing None to the opener.
        url = form.base_url
    return open_http(form.method, url, values)
752
def open_http_urllib(method, url, values):
    """
    Default opener for ``submit_form``, built on ``urllib``.

    For ``'GET'`` the url-encoded values are appended to the URL as a
    query string; for any other method they are sent as the request
    data.  Returns the result of ``urllib.urlopen``.
    """
    import urllib
    ## FIXME: should test that it's not a relative URL or something
    if method != 'GET':
        return urllib.urlopen(url, urllib.urlencode(values))
    if '?' in url:
        separator = '&'
    else:
        separator = '?'
    return urllib.urlopen(url + separator + urllib.urlencode(values), None)
766
class FieldsDict(DictMixin):
    """
    Dictionary-like view of field values.

    Items are read from and written to the ``.value`` of whatever
    ``inputs[name]`` returns; keys come from ``inputs.keys()``.  Keys
    cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs
    def __getitem__(self, item):
        return self.inputs[item].value
    def __setitem__(self, item, value):
        self.inputs[item].value = value
    def __delitem__(self, item):
        # The message used to name a class ("ElementDict") that is not
        # this one.
        raise KeyError(
            "You cannot remove keys from FieldsDict")
    def keys(self):
        return self.inputs.keys()
    def __contains__(self, item):
        return item in self.inputs

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())
787
class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    Fields are fetched by name with ``form.inputs['field_name']``.
    Several radio inputs sharing a name come back as a `RadioGroup`,
    several checkboxes as a `CheckboxGroup` (both allow value setting);
    otherwise the first matching element is returned.

    Iterating yields every select, input and textarea element
    individually, so it is not the same as looking up each name.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (name(.) = 'select' or name(.) = 'input' or name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[name() = 'select' or name() = 'input' or name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        owner = self.form._name()
        return '<%s for form %s>' % (self.__class__.__name__, owner)

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        matches = self._name_xpath(self.form, name=name)
        if not matches:
            raise KeyError(
                "No input element with the name %r" % name)
        several = len(matches) > 1
        # The group kind is decided by the first match only.
        input_type = matches[0].get('type')
        if several and input_type == 'radio':
            group = RadioGroup(matches)
        elif several and input_type == 'checkbox':
            group = CheckboxGroup(matches)
        else:
            # I don't like throwing away elements like this
            return matches[0]
        group.name = name
        return group

    def __contains__(self, name):
        return bool(self._name_xpath(self.form, name=name))

    def keys(self):
        """Distinct names of all fields that have a name, in no set order."""
        seen = sets.Set()
        for el in self:
            field_name = el.name
            if field_name is not None:
                seen.add(field_name)
        return list(seen)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        every_field = self._all_xpath(self.form)
        return iter(every_field)
852
class InputMixin(object):

    """
    Mix-in for all input elements (input, select, and textarea)
    """

    def _name__get(self):
        """
        Get/set the name of the element (its ``name`` attribute);
        deleting removes the attribute if it is present.
        """
        return self.get('name')
    def _name__set(self, value):
        self.set('name', value)
    def _name__del(self):
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']
    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        # Mention the type only when the object has a non-empty one.
        kind = getattr(self, 'type', None)
        suffix = ''
        if kind:
            suffix = ' type=%r' % kind
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, suffix)
880
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and
    the value can be read and written through ``.value``
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element);
        an element without text reads as the empty string.
        """
        text = self.text
        if not text:
            return ''
        return text
    def _value__set(self, value):
        self.text = value
    def _value__del(self):
        self.text = ''
    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

# Elements with the tag ``textarea`` are created as TextareaElement by default.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` is the value of the selected option, unless this is a
    multi-select element (``<select multiple>``), in which case it is
    a set-like object.  In either case ``.value_options`` gives the
    possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in self.getiterator('option'):
            if 'selected' in option.attrib:
                # FIXME: If value is None, what to return?, get_text()?
                return option.get('value')
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            self.value.clear()
            self.value.update(value)
            return
        chosen = None
        if value is not None:
            # Find the option first so that a bad value changes nothing.
            for option in self.getiterator('option'):
                # FIXME: also if el.get('value') is None?
                if option.get('value') == value:
                    chosen = option
                    break
            if chosen is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in self.getiterator('option'):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if chosen is not None:
            chosen.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
        else:
            self.value.clear()

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements).
        """
        possible = []
        for option in self.getiterator('option'):
            possible.append(option.get('value'))
        return possible
    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib
    def _multiple__set(self, value):
        if not value:
            if 'multiple' in self.attrib:
                del self.attrib['multiple']
            return
        self.set('multiple', '')
    multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)

# Elements with the tag ``select`` are created as SelectElement by default.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like option to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return self.select.getiterator('option')
    options = property(options)

    def __iter__(self):
        # Members of the set are the *selected* values only.  Yielding
        # every option made unselected values look like members, which
        # ``remove`` then rejects as "not currently selected".
        for option in self.options:
            if 'selected' in option.attrib:
                yield option.get('value')

    def add(self, item):
        """Select the first option with value ``item``; ValueError if none."""
        for option in self.options:
            if option.get('value') == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option with value ``item``; ValueError if it
        is not selected or if no option has that value.
        """
        for option in self.options:
            if option.get('value') == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
1032
class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share a name.

    Besides the list behaviour, ``.value`` checks/unchecks the inputs
    and ``.value_options`` lists the possible values.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        for radio in self:
            if 'checked' in radio.attrib:
                return radio.get('value')
        return None

    def _value__set(self, value):
        target = None
        if value is not None:
            # Find the radio first so that a bad value changes nothing.
            for radio in self:
                if radio.get('value') == value:
                    target = radio
                    break
            if target is None:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    def _value__del(self):
        self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        possible = []
        for radio in self:
            possible.append(radio.get('value'))
        return possible
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        members = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, members)
1084
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)
    def _value__set(self, value):
        # Validate before clearing, so that a rejected assignment does
        # not leave every checkbox unchecked.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        self.value.clear()
        self.value.update(value)
    def _value__del(self):
        self.value.clear()
    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1116
class CheckboxValues(SetMixin):

    """
    Set-like view of the values of the checked checkboxes in a group
    of checkboxes that share a name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        checked = []
        for el in self.group:
            if 'checked' in el.attrib:
                checked.append(el.get('value'))
        return iter(checked)

    def add(self, value):
        """Check the first checkbox with this value; KeyError if none."""
        for el in self.group:
            if el.get('value') == value:
                el.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the first checkbox with this value; KeyError if it is
        already unchecked or if no checkbox has that value.
        """
        for el in self.group:
            if el.get('value') != value:
                continue
            if 'checked' not in el.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del el.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            shown,
            self.group.name)
1159
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    The ``.type`` property gives the input type, lower-cased and
    defaulting to ``'text'``.

    The ``.value`` property reads and writes the element's value.

    Checkboxes and radios report ``input.checkable == True`` (every
    other type reports false) and expose a boolean ``.checked``
    property.
    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio without a ``value`` attribute
        yields ``'on'``; an unchecked one yields None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if not value:
            # A false value simply unchecks the input.
            self.checked = False
            return
        self.checked = True
        # Only string values replace the ``value`` attribute; any other
        # true value just checks the input.
        if isinstance(value, basestring):
            self.set('value', value)

    def _value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']
    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the type of this element (using the type attribute).
        """
        raw_type = self.get('type', 'text')
        return raw_type.lower()

    def _type__set(self, value):
        self.set('type', value)
    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')
    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']
    checked = property(_checked__get, _checked__set,
                       doc=_checked__get.__doc__)

HtmlElementClassLookup._default_element_classes['input'] = InputElement
class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.

        Returns None when the label has no (or an empty) ``for``
        attribute; otherwise the lookup is delegated to
        ``self.body.get_element_by_id``.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        # NOTE(review): what happens for an id that matches no element
        # depends on get_element_by_id -- confirm it cannot raise here.
        return self.body.get_element_by_id(target_id)

    def _for_element__set(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def _for_element__del(self):
        # The link to the target lives in the ``for`` attribute, so
        # that is what must be removed (previously this removed the
        # label's own ``id`` and left the link in place).
        if 'for' in self.attrib:
            del self.attrib['for']
    for_element = property(_for_element__get, _for_element__set, _for_element__del,
                           doc=_for_element__get.__doc__)

HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
__replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type".*?>').sub
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html"):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.  Note that you can pass
    the builtin ``unicode`` type as ``encoding`` argument to serialise
    to a unicode string.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        '<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        '<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        '<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'
    """
    serialized = etree.tostring(
        doc, method=method, pretty_print=pretty_print, encoding=encoding)
    if include_meta_content_type:
        return serialized
    # Strip the Content-Type <meta> tag from the serialized output.
    return __replace_meta_content_type('', serialized)
1329
def open_in_browser(doc):
    """
    Open the HTML document in a web browser (saving it to a temporary
    file to open it).

    ``doc`` may be anything with a ``write`` method (e.g. an element
    tree) or a bare element, which is wrapped in an ``ElementTree``
    first.  The temporary file is not deleted afterwards, because the
    browser may still be reading it after this function returns.
    """
    import os
    import tempfile
    import webbrowser
    try:
        write_doc = doc.write
    except AttributeError:
        write_doc = etree.ElementTree(element=doc).write
    # mkstemp creates the file atomically with a unique name, unlike
    # os.tempnam(), which only returns a name and is open to races.
    handle, fn = tempfile.mkstemp(suffix='.html')
    out = os.fdopen(handle, 'wb')
    try:
        write_doc(out, method="html")
    finally:
        # Always release the descriptor, even if serialisation fails.
        out.close()
    url = 'file://' + fn.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
1346 1347 ################################################################################ 1348 # configure Element class lookup 1349 ################################################################################ 1350
class HTMLParser(etree.HTMLParser):
    """
    ``etree.HTMLParser`` subclass that installs an
    ``HtmlElementClassLookup`` as its element class lookup.
    """

    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1355
class XHTMLParser(etree.XMLParser):
    """
    ``etree.XMLParser`` subclass that installs an
    ``HtmlElementClassLookup`` as its element class lookup.
    """

    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1360
def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.  All arguments are
    passed straight through to ``html_parser.makeelement``.
    """
    return html_parser.makeelement(*args, **kw)

# Module-level default parser instances.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()