1 """The ``lxml.html`` tool set for HTML handling.
2 """
3
4 import threading
5 import re
6 import urlparse
7 import copy
8 from lxml import etree
9 from lxml.html import defs
10 from lxml import cssselect
11 from lxml.html._setmixin import SetMixin
12 try:
13 from UserDict import DictMixin
14 except ImportError:
15
16 from lxml.html._dictmixin import DictMixin
17 import sets
18
19 __all__ = [
20 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
21 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
22 'find_rel_links', 'find_class', 'make_links_absolute',
23 'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser']
24
# Precompiled XPath expressions and regular expressions used by the
# helpers in this module.

# <a> elements that carry a ``rel`` attribute, at or below the context node.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]")

# Elements whose ``class`` attribute contains $class_name as a whole,
# space-separated token.  Both sides are padded with spaces so that "foo"
# does not match "foobar".
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Elements whose ``id`` attribute equals $id, at or below the context node.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# XPath string value of the context node.
_collect_string_content = etree.XPath("string()")
# ``url(...)`` references in CSS (case-insensitive); group 1 is the raw text
# between the parentheses, including any quotes.
_css_url_re = re.compile(r'url\((.*?)\)', re.I)
# ``@import "..."`` statements in CSS; only the double-quoted form matches.
_css_import_re = re.compile(r'@import "(.*?)"')
# <label> elements anywhere in the document whose ``for`` equals $id.
_label_xpath = etree.XPath("//label[@for=$id]")
# A run of non-space characters; used to split the space-separated
# ``archive`` attribute of <object> into individual links.
_archive_re = re.compile(r'[^ ]+')
34
36
38 """
39 Returns the base URL, given when the page was parsed.
40
41 Use with ``urlparse.urljoin(el.base_url, href)`` to get
42 absolute URLs.
43 """
44 return self.getroottree().docinfo.URL
45 base_url = property(base_url, doc=base_url.__doc__)
46
52 forms = property(forms, doc=forms.__doc__)
53
55 """
56 Return the <body> element. Can be called from a child element
57 to get the document's body.
58 """
59 return self.xpath('//body')[0]
60 body = property(body, doc=body.__doc__)
61
63 """
64 Returns the <head> element. Can be called from a child
65 element to get the document's head.
66 """
67 return self.xpath('//head')[0]
68 head = property(head, doc=head.__doc__)
69
71 """
72 Get or set any <label> element associated with this element.
73 """
74 id = self.get('id')
75 if not id:
76 return None
77 result = _label_xpath(self, id=id)
78 if not result:
79 return None
80 else:
81 return result[0]
83 id = self.get('id')
84 if not id:
85 raise TypeError(
86 "You cannot set a label for an element (%r) that has no id"
87 % self)
88 if not label.tag == 'label':
89 raise TypeError(
90 "You can only assign label to a label element (not %r)"
91 % label)
92 label.set('for', id)
94 label = self.label
95 if label is not None:
96 del label.attrib['for']
97 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
98
100 """
101 Removes this element from the tree, including its children and
102 text. The tail text is joined to the previous element or
103 parent.
104 """
105 parent = self.getparent()
106 assert parent is not None
107 if self.tail:
108 previous = self.getprevious()
109 if previous is None:
110 parent.text = (parent.text or '') + self.tail
111 else:
112 previous.tail = (previous.tail or '') + self.tail
113 parent.remove(self)
114
116 """
117 Remove the tag, but not its children or text. The children and text
118 are merged into the parent.
119
120 Example::
121
122 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
123 >>> h.find('.//b').drop_tag()
124 >>> print tostring(h)
125 <div>Hello World!</div>
126 """
127 parent = self.getparent()
128 assert parent is not None
129 previous = self.getprevious()
130 if self.text and isinstance(self.tag, basestring):
131
132 if previous is None:
133 parent.text = (parent.text or '') + self.text
134 else:
135 previous.tail = (previous.tail or '') + self.text
136 if self.tail:
137 if len(self):
138 last = self[-1]
139 last.tail = (last.tail or '') + self.tail
140 elif previous is None:
141 parent.text = (parent.text or '') + self.tail
142 else:
143 previous.tail = (previous.tail or '') + self.tail
144 index = parent.index(self)
145 parent[index:index+1] = self[:]
146
148 """
149 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
150 """
151 rel = rel.lower()
152 return [el for el in _rel_links_xpath(self)
153 if el.get('rel').lower() == rel]
154
156 """
157 Find any elements with the given class name.
158 """
159 return _class_xpath(self, class_name=class_name)
160
162 """
163 Get the first element in a document with the given id. If none is
164 found, return the default argument if provided or raise KeyError
165 otherwise.
166
167 Note that there can be more than one element with the same id,
168 and this isn't uncommon in HTML documents found in the wild.
169 Browsers return only the first match, and this function does
170 the same.
171 """
172 try:
173
174
175 return _id_xpath(self, id=id)[0]
176 except IndexError:
177 if default:
178 return default[0]
179 else:
180 raise KeyError, id
181
def text_content(self):
    """
    Return the text of this element together with the text of all of
    its descendants, as computed by the XPath ``string()`` function.
    """
    content = _collect_string_content(self)
    return content
187
189 """
190 Run the CSS expression on this element and its children,
191 returning a list of the results.
192
193 Equivalent to lxml.cssselect.CSSSelector(expr)(self) -- note
194 that pre-compiling the expression can provide a substantial
195 speedup.
196 """
197 return cssselect.CSSSelector(expr)(self)
198
199
200
201
202
204 """
205 Make all links in the document absolute, given the
206 ``base_url`` for the document (the full URL where the document
207 came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
208
209 If ``resolve_base_href`` is true, then any ``<base href>``
210 tags in the document are used *and* removed from the document.
211 If it is false then any such tag is ignored.
212 """
213 if base_url is None:
214 base_url = self.base_url
215 if base_url is None:
216 raise TypeError(
217 "No base_url given, and the document has no base_url")
218 if resolve_base_href:
219 self.resolve_base_href()
220 def link_repl(href):
221 return urlparse.urljoin(base_url, href)
222 self.rewrite_links(link_repl)
223
225 """
226 Find any ``<base href>`` tag in the document, and apply its
227 values to all links found in the document. Also remove the
228 tag once it has been applied.
229 """
230 base_href = None
231 basetags = self.xpath('//base[@href]')
232 for b in basetags:
233 base_href = b.get('href')
234 b.drop_tree()
235 if not base_href:
236 return
237 self.make_links_absolute(base_href, resolve_base_href=False)
238
240 """
241 Yield (element, attribute, link, pos), where attribute may be None
242 (indicating the link is in the text). ``pos`` is the position
243 where the link occurs; often 0, but sometimes something else in
244 the case of links in stylesheets or style tags.
245
246 Note: <base href> is *not* taken into account in any way. The
247 link you get is exactly the link in the document.
248 """
249 link_attrs = defs.link_attrs
250 for el in self.getiterator():
251 attribs = el.attrib
252 if el.tag != 'object':
253 for attrib in link_attrs:
254 if attrib in attribs:
255 yield (el, attrib, attribs[attrib], 0)
256 elif el.tag == 'object':
257 codebase = None
258
259
260 if 'codebase' in attribs:
261 codebase = el.get('codebase')
262 yield (el, 'codebase', codebase, 0)
263 for attrib in 'classid', 'data':
264 if attrib in attribs:
265 value = el.get(attrib)
266 if codebase is not None:
267 value = urlparse.urljoin(codebase, value)
268 yield (el, attrib, value, 0)
269 if 'archive' in attribs:
270 for match in _archive_re.finditer(el.get('archive')):
271 value = match.group(0)
272 if codebase is not None:
273 value = urlparse.urljoin(codebase, value)
274 yield (el, 'archive', value, match.start())
275 if el.tag == 'param':
276 valuetype = el.get('valuetype') or ''
277 if valuetype.lower() == 'ref':
278
279
280
281
282
283
284 yield (el, 'value', el.get('value'), 0)
285 if el.tag == 'style' and el.text:
286 for match in _css_url_re.finditer(el.text):
287 yield (el, None, match.group(1), match.start(1))
288 for match in _css_import_re.finditer(el.text):
289 yield (el, None, match.group(1), match.start(1))
290 if 'style' in attribs:
291 for match in _css_url_re.finditer(attribs['style']):
292 yield (el, 'style', match.group(1), match.start(1))
293
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called, and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If you give ``base_href`` then all links passed to
    ``link_repl_func()`` will take that into account.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.  Once an attribute or tag
    text has been removed, the results for any further links found in
    that same attribute/text are ignored (``link_repl_func`` is still
    called for them).

    Several links may live in one attribute or in one ``<style>``
    text; each of them is replaced at its correct location even when
    an earlier replacement changed the length of that string.
    """
    if base_href is not None:
        # Making the links absolute against base_href first means that
        # link_repl_func only ever sees resolved links.
        self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()

    # iterlinks() reports positions relative to the string as it was when
    # the element was scanned.  Remember how much every replacement grew or
    # shrank that string so later positions can be corrected.
    # (el, attrib) -> [(original position, length change), ...]
    length_changes = {}
    # (el, attrib) -> True once the attribute/text has been removed
    removed = {}

    for el, attrib, link, pos in self.iterlinks():
        new_link = link_repl_func(link)
        if new_link == link:
            continue
        key = (el, attrib)
        if key in removed:
            # Nothing left to edit; also avoids a KeyError from deleting
            # the same attribute twice.
            continue
        if new_link is None:
            # Remove the attribute or element content
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            removed[key] = True
            continue

        if attrib is None:
            cur = el.text
        else:
            cur = el.attrib[attrib]

        # Shift the reported position by every earlier edit that happened
        # in front of it in the same string.
        start = pos
        for old_pos, delta in length_changes.get(key, ()):
            if old_pos < pos:
                start += delta
        new = cur[:start] + new_link + cur[start+len(link):]
        length_changes.setdefault(key, []).append((pos, len(new) - len(cur)))

        if attrib is None:
            el.text = new
        else:
            el.attrib[attrib] = new
340
341
343 """
344 An object that represents a method on an element as a function;
345 the function takes either an element or an HTML string. It
346 returns whatever the function normally returns, or if the function
347 works in-place (and so returns None) it returns a serialized form
348 of the resulting document.
349 """
350 - def __init__(self, name, copy=False, source_class=HtmlMixin):
355 if isinstance(doc, basestring):
356 if 'copy' in kw:
357 raise TypeError(
358 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
359 return_string = True
360 doc = fromstring(doc, **kw)
361 else:
362 if 'copy' in kw:
363 copy = kw.pop('copy')
364 else:
365 copy = self.copy
366 return_string = False
367 if copy:
368 doc = copy.deepcopy(doc)
369 meth = getattr(doc, self.name)
370 result = meth(*args, **kw)
371
372 if result is None:
373
374 if return_string:
375 return tostring(doc)
376 else:
377 return doc
378 else:
379 return result
380
# Module-level function forms of the element methods of the same name.
# Each wrapper accepts either an element or an HTML string.  ``copy`` is
# the default for whether an element passed in is deep-copied before the
# method runs; it is set to True for the wrappers whose methods rewrite
# the document (make_links_absolute, resolve_base_href, rewrite_links).
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
387
390
393
396
399
400
402 """A lookup scheme for HTML Element classes.
403
404 To create a lookup instance with different Element classes, pass a tag
405 name mapping of Element classes in the ``classes`` keyword argument and/or
406 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
407 The special key '*' denotes a Mixin class that should be mixed into all
408 Element classes.
409 """
410 _default_element_classes = {}
411
def __init__(self, classes=None, mixins=None):
    """
    Build the tag name -> Element class table used by ``lookup()``.

    ``classes`` maps tag names to Element classes; when it is omitted
    a copy of ``_default_element_classes`` is used.

    ``mixins`` may be either a mapping of tag name -> mixin class or
    an iterable of ``(tag name, mixin class)`` pairs (the latter
    allows several mixins for one tag).  The special tag name ``'*'``
    adds the mixin to every tag that is present in ``classes``.  For
    each affected tag a new class is created whose bases are the
    mixins followed by the tag's current class (``HtmlElement`` if the
    tag has none yet).
    """
    etree.CustomElementClassLookup.__init__(self)
    if classes is None:
        classes = self._default_element_classes.copy()
    if mixins:
        if hasattr(mixins, 'items'):
            # A mapping was passed (as the class docstring describes);
            # iterating it directly would only yield the keys.
            mixins = mixins.items()
        mixers = {}
        for name, value in mixins:
            if name == '*':
                for n in classes.keys():
                    mixers.setdefault(n, []).append(value)
            else:
                mixers.setdefault(name, []).append(value)
        for name, mix_bases in mixers.items():
            cur = classes.get(name, HtmlElement)
            # Mixins come first so that they take precedence in the MRO.
            bases = tuple(mix_bases + [cur])
            classes[name] = type(cur.__name__, bases, {})
    self._element_classes = classes
429
def lookup(self, node_type, document, namespace, name):
    """
    Return the class to use for a node of the given ``node_type``.

    Elements are looked up by lower-cased tag name in the table built
    by ``__init__`` (falling back to ``HtmlElement``); comments,
    processing instructions and entities map to their fixed Html*
    classes.  ``document`` and ``namespace`` are not consulted.
    """
    if node_type == 'element':
        tag = name.lower()
        return self._element_classes.get(tag, HtmlElement)
    if node_type == 'comment':
        return HtmlComment
    if node_type == 'PI':
        return HtmlProcessingInstruction
    if node_type == 'entity':
        return HtmlEntity
    # Unknown node type: no class chosen here (presumably lxml then
    # falls back to its normal lookup -- confirm against the lxml docs).
    return None
441
442
443
444
445
454
457 """
458 Parses several HTML elements, returning a list of elements.
459
460 The first item in the list may be a string (though leading
461 whitespace is removed). If no_leading_text is true, then it will
462 be an error if there is leading text, and it will always be a list
463 of only elements.
464
465 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
466 """
467 if parser is None:
468 parser = html_parser
469
470 start = html[:20].lstrip().lower()
471 if not start.startswith('<html') and not start.startswith('<!doctype'):
472 html = '<html><body>%s</body></html>' % html
473 doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
474 assert doc.tag == 'html'
475 bodies = [e for e in doc if e.tag == 'body']
476 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
477 body = bodies[0]
478 elements = []
479 if no_leading_text and body.text and body.text.strip():
480 raise etree.ParserError(
481 "There is leading text: %r" % body.text)
482 if body.text and body.text.strip():
483 elements.append(body.text)
484 elements.extend(body)
485
486
487 return elements
488
491 """
492 Parses a single HTML element; it is an error if there is more than
493 one element, or if anything but whitespace precedes or follows the
494 element.
495
496 If create_parent is true (or is a tag name) then a parent node
497 will be created to encapsulate the HTML in a single element.
498
499 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
500 """
501 if parser is None:
502 parser = html_parser
503 if create_parent:
504 if not isinstance(create_parent, basestring):
505 create_parent = 'div'
506 return fragment_fromstring('<%s>%s</%s>' % (
507 create_parent, html, create_parent),
508 parser=parser, base_url=base_url, **kw)
509 elements = fragments_fromstring(html, parser=parser, no_leading_text=True,
510 base_url=base_url, **kw)
511 if not elements:
512 raise etree.ParserError(
513 "No elements found")
514 if len(elements) > 1:
515 raise etree.ParserError(
516 "Multiple elements found (%s)"
517 % ', '.join([_element_name(e) for e in elements]))
518 el = elements[0]
519 if el.tail and el.tail.strip():
520 raise etree.ParserError(
521 "Element followed by text: %r" % el.tail)
522 el.tail = None
523 return el
524
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document:

    * input that starts with ``<html`` or ``<!doctype`` (ignoring any
      leading whitespace) is returned as parsed by
      ``document_fromstring()``;
    * if the parsed tree has a ``<head>``, or has no ``<body>`` at all,
      the whole document root is returned;
    * if the body holds exactly one element and no non-whitespace text,
      that element is returned;
    * otherwise the body itself is returned, renamed to ``div`` if it
      contains a block-level tag and to ``span`` if not.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    # Strip before slicing so that any amount of leading whitespace in
    # front of the doctype / <html> tag is tolerated.
    start = html.lstrip()[:10].lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        # Looks like a full HTML document
        return document_fromstring(html, parser=parser, base_url=base_url, **kw)
    # Otherwise parse it as a document and pick the useful part out of it
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    bodies = doc.findall('body')
    if bodies:
        body = bodies[0]
        # Several bodies should not happen; if they do, fold the extra
        # ones into the first.
        for other_body in bodies[1:]:
            if other_body.text:
                if len(body):
                    body[-1].tail = (body[-1].tail or '') + other_body.text
                else:
                    body.text = (body.text or '') + other_body.text
            body.extend(other_body)
            # The tail and attributes of the extra body are dropped
            other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if heads:
        # There is real document structure, so keep all of it
        head = heads[0]
        for other_head in heads[1:]:
            head.extend(other_head)
            # Text and tail of an extra head are not preserved
            other_head.drop_tree()
        return doc
    if body is None:
        # Neither <head> nor <body>: there is nothing to unwrap, so hand
        # back the document root instead of failing on len(None).
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Several nodes or loose text: return the body as a generic wrapper
    # element whose tag suits its content.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
583
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.

    The return value is a tree, not an element; call
    ``parse(...).getroot()`` to obtain the document root.

    ``parser`` defaults to the module-level ``html_parser``.  The
    ``base_url`` keyword overrides the base URL, which is most useful
    when parsing from a file-like object.  Any further keyword
    arguments are passed on to ``etree.parse()``.
    """
    chosen_parser = parser
    if chosen_parser is None:
        chosen_parser = html_parser
    return etree.parse(filename_or_url, chosen_parser,
                       base_url=base_url, **kw)
596
598
599
600 for el in el.getiterator():
601 if el.tag in defs.block_tags:
602 return True
603 return False
604
606 if isinstance(el, etree.CommentBase):
607 return 'comment'
608 elif isinstance(el, basestring):
609 return 'string'
610 else:
611 return el.tag
612
613
614
615
616
717
# Make FormElement the class that a default HtmlElementClassLookup
# returns for <form> tags.
HtmlElementClassLookup._default_element_classes['form'] = FormElement
719
752
754 import urllib
755
756 if method == 'GET':
757 if '?' in url:
758 url += '&'
759 else:
760 url += '?'
761 url += urllib.urlencode(values)
762 data = None
763 else:
764 data = urllib.urlencode(values)
765 return urllib.urlopen(url, data)
766
768
772 return self.inputs[item].value
774 self.inputs[item].value = value
776 raise KeyError(
777 "You cannot remove keys from ElementDict")
779 return self.inputs.keys()
781 return item in self.inputs
782
784 return '<%s for form %s>' % (
785 self.__class__.__name__,
786 self.inputs.form._name())
787
852
880
881 -class TextareaElement(InputMixin, HtmlElement):
882 """
883 ``<textarea>`` element. You can get the name with ``.name`` and
884 get/set the value with ``.value``
885 """
886
887 - def _value__get(self):
888 """
889 Get/set the value (which is the contents of this element)
890 """
891 return self.text or ''
892 - def _value__set(self, value):
894 - def _value__del(self):
896 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
897
# Make TextareaElement the class that a default HtmlElementClassLookup
# returns for <textarea> tags.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
899
901 """
902 ``<select>`` element. You can get the name with ``.name``.
903
904 ``.value`` will be the value of the selected option, unless this
905 is a multi-select element (``<select multiple>``), in which case
906 it will be a set-like object. In either case ``.value_options``
907 gives the possible values.
908
909 The boolean attribute ``.multiple`` shows if this is a
910 multi-select.
911 """
912
914 """
915 Get/set the value of this select (the selected option).
916
917 If this is a multi-select, this is a set-like object that
918 represents all the selected options.
919 """
920 if self.multiple:
921 return MultipleSelectOptions(self)
922 for el in self.getiterator('option'):
923 if 'selected' in el.attrib:
924 value = el.get('value')
925
926 return value
927 return None
928
930 if self.multiple:
931 if isinstance(value, basestring):
932 raise TypeError(
933 "You must pass in a sequence")
934 self.value.clear()
935 self.value.update(value)
936 return
937 if value is not None:
938 for el in self.getiterator('option'):
939
940 if el.get('value') == value:
941 checked_option = el
942 break
943 else:
944 raise ValueError(
945 "There is no option with the value of %r" % value)
946 for el in self.getiterator('option'):
947 if 'selected' in el.attrib:
948 del el.attrib['selected']
949 if value is not None:
950 checked_option.set('selected', '')
951
953
954 if self.multiple:
955 self.value.clear()
956 else:
957 self.value = None
958
959 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
960
962 """
963 All the possible values this select can have (the ``value``
964 attribute of all the ``<option>`` elements).
965 """
966 return [el.get('value') for el in self.getiterator('option')]
967 value_options = property(value_options, doc=value_options.__doc__)
968
970 """
971 Boolean attribute: is there a ``multiple`` attribute on this element.
972 """
973 return 'multiple' in self.attrib
975 if value:
976 self.set('multiple', '')
977 elif 'multiple' in self.attrib:
978 del self.attrib['multiple']
979 multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)
980
# Make SelectElement the class that a default HtmlElementClassLookup
# returns for <select> tags.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement
982
984 """
985 Represents all the selected options in a ``<select multiple>`` element.
986
987 You can add to this set-like option to select an option, or remove
988 to unselect the option.
989 """
990
993
995 """
996 Iterator of all the ``<option>`` elements.
997 """
998 return self.select.getiterator('option')
999 options = property(options)
1000
1002 for option in self.options:
1003 yield option.get('value')
1004
def add(self, item):
    """
    Select the first ``<option>`` whose ``value`` attribute equals
    ``item``.

    Raises ValueError if no option has that value.
    """
    for candidate in self.options:
        if candidate.get('value') != item:
            continue
        candidate.set('selected', '')
        return
    raise ValueError(
        "There is no option with the value %r" % item)
1013
1015 for option in self.options:
1016 if option.get('value') == item:
1017 if 'selected' in option.attrib:
1018 del option.attrib['selected']
1019 else:
1020 raise ValueError(
1021 "The option %r is not currently selected" % item)
1022 break
1023 else:
1024 raise ValueError(
1025 "There is not option with the value %r" % item)
1026
1028 return '<%s {%s} for select name=%r>' % (
1029 self.__class__.__name__,
1030 ', '.join([repr(v) for v in self]),
1031 self.select.name)
1032
1034 """
1035 This object represents several ``<input type=radio>`` elements
1036 that have the same name.
1037
1038 You can use this like a list, but also use the property
1039 ``.value`` to check/uncheck inputs. Also you can use
1040 ``.value_options`` to get the possible values.
1041 """
1042
1044 """
1045 Get/set the value, which checks the radio with that value (and
1046 unchecks any other value).
1047 """
1048 for el in self:
1049 if 'checked' in el.attrib:
1050 return el.get('value')
1051 return None
1052
1054 if value is not None:
1055 for el in self:
1056 if el.get('value') == value:
1057 checked_option = el
1058 break
1059 else:
1060 raise ValueError(
1061 "There is no radio input with the value %r" % value)
1062 for el in self:
1063 if 'checked' in el.attrib:
1064 del el.attrib['checked']
1065 if value is not None:
1066 checked_option.set('checked', '')
1067
1070
1071 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1072
1074 """
1075 Returns a list of all the possible values.
1076 """
1077 return [el.get('value') for el in self]
1078 value_options = property(value_options, doc=value_options.__doc__)
1079
1081 return '%s(%s)' % (
1082 self.__class__.__name__,
1083 list.__repr__(self))
1084
1086 """
1087 Represents a group of checkboxes (``<input type=checkbox>``) that
1088 have the same name.
1089
1090 In addition to using this like a list, the ``.value`` attribute
1091 returns a set-like object that you can add to or remove from to
1092 check and uncheck checkboxes. You can also use ``.value_options``
1093 to get the possible values.
1094 """
1095
1097 """
1098 Return a set-like object that can be modified to check or
1099 uncheck individual checkboxes according to their value.
1100 """
1101 return CheckboxValues(self)
1103 self.value.clear()
1104 if not hasattr(value, '__iter__'):
1105 raise ValueError(
1106 "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
1107 % (self[0].name, value))
1108 self.value.update(value)
1111 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1112
1114 return '%s(%s)' % (
1115 self.__class__.__name__, list.__repr__(self))
1116
1118
1119 """
1120 Represents the values of the checked checkboxes in a group of
1121 checkboxes with the same name.
1122 """
1123
1126
1128 return iter([
1129 el.get('value')
1130 for el in self.group
1131 if 'checked' in el.attrib])
1132
def add(self, value):
    """
    Check the first checkbox in the group whose ``value`` attribute
    equals ``value``.

    Raises KeyError if the group has no checkbox with that value.
    """
    for box in self.group:
        if box.get('value') != value:
            continue
        box.set('checked', '')
        return
    raise KeyError("No checkbox with value %r" % value)
1140
1142 for el in self.group:
1143 if el.get('value') == value:
1144 if 'checked' in el.attrib:
1145 del el.attrib['checked']
1146 else:
1147 raise KeyError(
1148 "The checkbox with value %r was already unchecked" % value)
1149 break
1150 else:
1151 raise KeyError(
1152 "No checkbox with value %r" % value)
1153
1155 return '<%s {%s} for checkboxes name=%r>' % (
1156 self.__class__.__name__,
1157 ', '.join([repr(v) for v in self]),
1158 self.group.name)
1159
1243
# Make InputElement the class that a default HtmlElementClassLookup
# returns for <input> tags.
HtmlElementClassLookup._default_element_classes['input'] = InputElement
1245
1247 """
1248 Represents a ``<label>`` element.
1249
1250 Label elements are linked to other elements with their ``for``
1251 attribute. You can access this element with ``label.for_element``.
1252 """
1253
1255 """
1256 Get/set the element this label points to. Return None if it
1257 can't be found.
1258 """
1259 id = self.get('for')
1260 if not id:
1261 return None
1262 return self.body.get_element_by_id(id)
1264 id = other.get('id')
1265 if not id:
1266 raise TypeError(
1267 "Element %r has no id attribute" % other)
1268 self.set('for', id)
1272 for_element = property(_for_element__get, _for_element__set, _for_element__del,
1273 doc=_for_element__get.__doc__)
1274
# Make LabelElement the class that a default HtmlElementClassLookup
# returns for <label> tags.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1276
1277
1278
1279
1280
1281
1282
# Bound ``sub`` method of a regex that matches a
# ``<meta http-equiv="Content-Type" ...>`` tag up to the first ``>``.
# tostring() calls it with an empty replacement to strip that tag from
# serialized output.
__replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type".*?>').sub
1285
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html"):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is false (the default), any
    ``<meta http-equiv="Content-Type" ...>`` tag is removed from the
    serialized markup.  If it is true, the serializer's output is
    returned untouched (the HTML serializer may add such a tag to the
    head itself).  Plain text output (``method='text'``) is never
    altered, since it contains no markup to strip.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.  Note that you can pass
    the builtin ``unicode`` type as ``encoding`` argument to serialise
    to a unicode string.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        '<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        '<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        '<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'
    """
    html = etree.tostring(doc, method=method, pretty_print=pretty_print,
                          encoding=encoding)
    # Text output is the document's character data, not markup: running
    # the meta-tag regex over it could only delete genuine content that
    # happens to look like a tag.
    if method != 'text' and not include_meta_content_type:
        html = __replace_meta_content_type('', html)
    return html
1329
1331 """
1332 Open the HTML document in a web browser (saving it to a temporary
1333 file to open it).
1334 """
1335 import os
1336 import webbrowser
1337 try:
1338 write_doc = doc.write
1339 except AttributeError:
1340 write_doc = etree.ElementTree(element=doc).write
1341 fn = os.tempnam() + '.html'
1342 write_doc(fn, method="html")
1343 url = 'file://' + fn.replace(os.path.sep, '/')
1344 print url
1345 webbrowser.open(url)
1346
1347
1348
1349
1350
1355
1360
1362 """Create a new HTML Element.
1363
1364 This can also be used for XHTML documents.
1365 """
1366 v = html_parser.makeelement(*args, **kw)
1367 return v
1368
# Shared module-level parser instances.  ``html_parser`` is the default
# that the parsing functions in this module use when they are called
# without a ``parser`` argument.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()
1371