Skip to content

Commit 1492be5

Browse files
authored
[RFC] DOM HTML5 parsing and serialization support (#12111)
1 parent 17d2917 commit 1492be5

File tree

162 files changed

+8283
-1212
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

162 files changed

+8283
-1212
lines changed

NEWS

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ DOM:
1010
. Implement #53655 (Improve speed of DOMNode::C14N() on large XML documents).
1111
(nielsdos)
1212
. Fix cloning attribute with namespace disappearing namespace. (nielsdos)
13+
. Implement DOM HTML5 parsing and serialization RFC. (nielsdos)
1314

1415
FTP:
1516
. Removed the deprecated inet_ntoa call support. (David Carlier)

UPGRADING

+8
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,14 @@ PHP 8.4 UPGRADE NOTES
8080
. Added constant DOMNode::DOCUMENT_POSITION_CONTAINS.
8181
. Added constant DOMNode::DOCUMENT_POSITION_CONTAINED_BY.
8282
. Added constant DOMNode::DOCUMENT_POSITION_IMPLEMENTATION_SPECIFIC.
83+
. Implemented DOM HTML5 parsing and serialization.
84+
RFC: https://wiki.php.net/rfc/domdocument_html5_parser.
85+
This RFC adds the new DOM namespace along with class and constant aliases.
86+
There are two new classes to handle HTML and XML documents:
87+
DOM\HTMLDocument and DOM\XMLDocument.
88+
These classes provide a cleaner API to handle HTML and XML documents.
89+
Furthermore, the DOM\HTMLDocument class implements spec-compliant HTML5
90+
parsing and serialization.
8391

8492
- Phar:
8593
. Added support for the unix timestamp extension for zip archives.

UPGRADING.INTERNALS

+4
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,10 @@ PHP 8.4 INTERNALS UPGRADE NOTES
5252
- The function php_xsl_create_object() was removed as it was not used
5353
nor exported.
5454

55+
d. ext/libxml
56+
- Added php_libxml_pretend_ctx_error_ex() to emit errors as if they had come
57+
from libxml.
58+
5559
========================
5660
4. OpCode changes
5761
========================

ext/dom/config.m4

+17-2
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,21 @@ if test "$PHP_DOM" != "no"; then
1212

1313
PHP_SETUP_LIBXML(DOM_SHARED_LIBADD, [
1414
AC_DEFINE(HAVE_DOM,1,[ ])
15+
PHP_LEXBOR_CFLAGS="-I@ext_srcdir@/lexbor -DLEXBOR_STATIC"
16+
LEXBOR_DIR="lexbor/lexbor"
17+
LEXBOR_SOURCES="$LEXBOR_DIR/ports/posix/lexbor/core/memory.c \
18+
$LEXBOR_DIR/core/array_obj.c $LEXBOR_DIR/core/array.c $LEXBOR_DIR/core/avl.c $LEXBOR_DIR/core/bst.c $LEXBOR_DIR/core/diyfp.c $LEXBOR_DIR/core/conv.c $LEXBOR_DIR/core/dobject.c $LEXBOR_DIR/core/dtoa.c $LEXBOR_DIR/core/hash.c $LEXBOR_DIR/core/mem.c $LEXBOR_DIR/core/mraw.c $LEXBOR_DIR/core/print.c $LEXBOR_DIR/core/serialize.c $LEXBOR_DIR/core/shs.c $LEXBOR_DIR/core/str.c $LEXBOR_DIR/core/strtod.c \
19+
$LEXBOR_DIR/dom/interface.c $LEXBOR_DIR/dom/interfaces/attr.c $LEXBOR_DIR/dom/interfaces/cdata_section.c $LEXBOR_DIR/dom/interfaces/character_data.c $LEXBOR_DIR/dom/interfaces/comment.c $LEXBOR_DIR/dom/interfaces/document.c $LEXBOR_DIR/dom/interfaces/document_fragment.c $LEXBOR_DIR/dom/interfaces/document_type.c $LEXBOR_DIR/dom/interfaces/element.c $LEXBOR_DIR/dom/interfaces/node.c $LEXBOR_DIR/dom/interfaces/processing_instruction.c $LEXBOR_DIR/dom/interfaces/shadow_root.c $LEXBOR_DIR/dom/interfaces/text.c \
20+
$LEXBOR_DIR/html/tokenizer/error.c $LEXBOR_DIR/html/tokenizer/state_comment.c $LEXBOR_DIR/html/tokenizer/state_doctype.c $LEXBOR_DIR/html/tokenizer/state_rawtext.c $LEXBOR_DIR/html/tokenizer/state_rcdata.c $LEXBOR_DIR/html/tokenizer/state_script.c $LEXBOR_DIR/html/tokenizer/state.c \
21+
$LEXBOR_DIR/html/tree/active_formatting.c $LEXBOR_DIR/html/tree/error.c $LEXBOR_DIR/html/tree/insertion_mode/after_after_body.c $LEXBOR_DIR/html/tree/insertion_mode/after_after_frameset.c $LEXBOR_DIR/html/tree/insertion_mode/after_body.c $LEXBOR_DIR/html/tree/insertion_mode/after_frameset.c $LEXBOR_DIR/html/tree/insertion_mode/after_head.c $LEXBOR_DIR/html/tree/insertion_mode/before_head.c $LEXBOR_DIR/html/tree/insertion_mode/before_html.c $LEXBOR_DIR/html/tree/insertion_mode/foreign_content.c $LEXBOR_DIR/html/tree/insertion_mode/in_body.c $LEXBOR_DIR/html/tree/insertion_mode/in_caption.c $LEXBOR_DIR/html/tree/insertion_mode/in_cell.c $LEXBOR_DIR/html/tree/insertion_mode/in_column_group.c $LEXBOR_DIR/html/tree/insertion_mode/in_frameset.c $LEXBOR_DIR/html/tree/insertion_mode/in_head.c $LEXBOR_DIR/html/tree/insertion_mode/in_head_noscript.c $LEXBOR_DIR/html/tree/insertion_mode/initial.c $LEXBOR_DIR/html/tree/insertion_mode/in_row.c $LEXBOR_DIR/html/tree/insertion_mode/in_select.c $LEXBOR_DIR/html/tree/insertion_mode/in_select_in_table.c $LEXBOR_DIR/html/tree/insertion_mode/in_table_body.c $LEXBOR_DIR/html/tree/insertion_mode/in_table.c $LEXBOR_DIR/html/tree/insertion_mode/in_table_text.c $LEXBOR_DIR/html/tree/insertion_mode/in_template.c $LEXBOR_DIR/html/tree/insertion_mode/text.c $LEXBOR_DIR/html/tree/open_elements.c \
22+
$LEXBOR_DIR/encoding/big5.c $LEXBOR_DIR/encoding/decode.c $LEXBOR_DIR/encoding/encode.c $LEXBOR_DIR/encoding/encoding.c $LEXBOR_DIR/encoding/euc_kr.c $LEXBOR_DIR/encoding/gb18030.c $LEXBOR_DIR/encoding/iso_2022_jp_katakana.c $LEXBOR_DIR/encoding/jis0208.c $LEXBOR_DIR/encoding/jis0212.c $LEXBOR_DIR/encoding/range.c $LEXBOR_DIR/encoding/res.c $LEXBOR_DIR/encoding/single.c \
23+
$LEXBOR_DIR/html/encoding.c $LEXBOR_DIR/html/interface.c $LEXBOR_DIR/html/parser.c $LEXBOR_DIR/html/token.c $LEXBOR_DIR/html/token_attr.c $LEXBOR_DIR/html/tokenizer.c $LEXBOR_DIR/html/tree.c \
24+
$LEXBOR_DIR/html/interfaces/anchor_element.c $LEXBOR_DIR/html/interfaces/area_element.c $LEXBOR_DIR/html/interfaces/audio_element.c $LEXBOR_DIR/html/interfaces/base_element.c $LEXBOR_DIR/html/interfaces/body_element.c $LEXBOR_DIR/html/interfaces/br_element.c $LEXBOR_DIR/html/interfaces/button_element.c $LEXBOR_DIR/html/interfaces/canvas_element.c $LEXBOR_DIR/html/interfaces/data_element.c $LEXBOR_DIR/html/interfaces/data_list_element.c $LEXBOR_DIR/html/interfaces/details_element.c $LEXBOR_DIR/html/interfaces/dialog_element.c $LEXBOR_DIR/html/interfaces/directory_element.c $LEXBOR_DIR/html/interfaces/div_element.c $LEXBOR_DIR/html/interfaces/d_list_element.c $LEXBOR_DIR/html/interfaces/document.c $LEXBOR_DIR/html/interfaces/element.c $LEXBOR_DIR/html/interfaces/embed_element.c $LEXBOR_DIR/html/interfaces/field_set_element.c $LEXBOR_DIR/html/interfaces/font_element.c $LEXBOR_DIR/html/interfaces/form_element.c $LEXBOR_DIR/html/interfaces/frame_element.c $LEXBOR_DIR/html/interfaces/frame_set_element.c $LEXBOR_DIR/html/interfaces/head_element.c $LEXBOR_DIR/html/interfaces/heading_element.c $LEXBOR_DIR/html/interfaces/hr_element.c $LEXBOR_DIR/html/interfaces/html_element.c $LEXBOR_DIR/html/interfaces/iframe_element.c $LEXBOR_DIR/html/interfaces/image_element.c $LEXBOR_DIR/html/interfaces/input_element.c $LEXBOR_DIR/html/interfaces/label_element.c $LEXBOR_DIR/html/interfaces/legend_element.c $LEXBOR_DIR/html/interfaces/li_element.c $LEXBOR_DIR/html/interfaces/link_element.c $LEXBOR_DIR/html/interfaces/map_element.c $LEXBOR_DIR/html/interfaces/marquee_element.c $LEXBOR_DIR/html/interfaces/media_element.c $LEXBOR_DIR/html/interfaces/menu_element.c $LEXBOR_DIR/html/interfaces/meta_element.c $LEXBOR_DIR/html/interfaces/meter_element.c $LEXBOR_DIR/html/interfaces/mod_element.c $LEXBOR_DIR/html/interfaces/object_element.c $LEXBOR_DIR/html/interfaces/o_list_element.c $LEXBOR_DIR/html/interfaces/opt_group_element.c $LEXBOR_DIR/html/interfaces/option_element.c $LEXBOR_DIR/html/interfaces/output_element.c $LEXBOR_DIR/html/interfaces/paragraph_element.c $LEXBOR_DIR/html/interfaces/param_element.c $LEXBOR_DIR/html/interfaces/picture_element.c $LEXBOR_DIR/html/interfaces/pre_element.c $LEXBOR_DIR/html/interfaces/progress_element.c $LEXBOR_DIR/html/interfaces/quote_element.c $LEXBOR_DIR/html/interfaces/script_element.c $LEXBOR_DIR/html/interfaces/select_element.c $LEXBOR_DIR/html/interfaces/slot_element.c $LEXBOR_DIR/html/interfaces/source_element.c $LEXBOR_DIR/html/interfaces/span_element.c $LEXBOR_DIR/html/interfaces/style_element.c $LEXBOR_DIR/html/interfaces/table_caption_element.c $LEXBOR_DIR/html/interfaces/table_cell_element.c $LEXBOR_DIR/html/interfaces/table_col_element.c $LEXBOR_DIR/html/interfaces/table_element.c $LEXBOR_DIR/html/interfaces/table_row_element.c $LEXBOR_DIR/html/interfaces/table_section_element.c $LEXBOR_DIR/html/interfaces/template_element.c $LEXBOR_DIR/html/interfaces/text_area_element.c $LEXBOR_DIR/html/interfaces/time_element.c $LEXBOR_DIR/html/interfaces/title_element.c $LEXBOR_DIR/html/interfaces/track_element.c $LEXBOR_DIR/html/interfaces/u_list_element.c $LEXBOR_DIR/html/interfaces/unknown_element.c $LEXBOR_DIR/html/interfaces/video_element.c $LEXBOR_DIR/html/interfaces/window.c \
25+
$LEXBOR_DIR/selectors/selectors.c \
26+
$LEXBOR_DIR/ns/ns.c \
27+
$LEXBOR_DIR/tag/tag.c"
1528
PHP_NEW_EXTENSION(dom, [php_dom.c attr.c document.c \
29+
xml_document.c html_document.c html5_serializer.c html5_parser.c namespace_compat.c \
1630
domexception.c parentnode.c \
1731
processinginstruction.c cdatasection.c \
1832
documentfragment.c domimplementation.c \
@@ -21,8 +35,9 @@ if test "$PHP_DOM" != "no"; then
2135
nodelist.c text.c comment.c \
2236
entityreference.c \
2337
notation.c xpath.c dom_iterators.c \
24-
namednodemap.c],
25-
$ext_shared)
38+
namednodemap.c \
39+
$LEXBOR_SOURCES],
40+
$ext_shared,,$PHP_LEXBOR_CFLAGS)
2641
PHP_SUBST(DOM_SHARED_LIBADD)
2742
PHP_INSTALL_HEADERS([ext/dom/xml_common.h])
2843
PHP_ADD_EXTENSION_DEP(dom, libxml)

ext/dom/config.w32

+17-1
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,29 @@ if (PHP_DOM == "yes") {
88
CHECK_HEADER_ADD_INCLUDE("libxml/parser.h", "CFLAGS_DOM", PHP_PHP_BUILD + "\\include\\libxml2")
99
) {
1010
EXTENSION("dom", "php_dom.c attr.c document.c \
11+
xml_document.c html_document.c html5_serializer.c html5_parser.c namespace_compat.c \
1112
domexception.c parentnode.c processinginstruction.c \
1213
cdatasection.c documentfragment.c domimplementation.c element.c \
1314
node.c characterdata.c documenttype.c \
1415
entity.c nodelist.c text.c comment.c \
1516
entityreference.c \
1617
notation.c xpath.c dom_iterators.c \
17-
namednodemap.c");
18+
namednodemap.c", null, "-Iext/dom/lexbor");
19+
20+
ADD_SOURCES("ext/dom/lexbor/lexbor/ports/windows_nt/lexbor/core", "memory.c", "dom");
21+
ADD_SOURCES("ext/dom/lexbor/lexbor/core", "array_obj.c array.c avl.c bst.c diyfp.c conv.c dobject.c dtoa.c hash.c mem.c mraw.c print.c serialize.c shs.c str.c strtod.c", "dom");
22+
ADD_SOURCES("ext/dom/lexbor/lexbor/dom", "interface.c", "dom");
23+
ADD_SOURCES("ext/dom/lexbor/lexbor/dom/interfaces", "attr.c cdata_section.c character_data.c comment.c document.c document_fragment.c document_type.c element.c node.c processing_instruction.c shadow_root.c text.c", "dom");
24+
ADD_SOURCES("ext/dom/lexbor/lexbor/html/tokenizer", "error.c state_comment.c state_doctype.c state_rawtext.c state_rcdata.c state_script.c state.c", "dom");
25+
ADD_SOURCES("ext/dom/lexbor/lexbor/html/tree", "active_formatting.c open_elements.c error.c", "dom");
26+
ADD_SOURCES("ext/dom/lexbor/lexbor/html/tree/insertion_mode", "after_after_body.c after_after_frameset.c after_body.c after_frameset.c after_head.c before_head.c before_html.c foreign_content.c in_body.c in_caption.c in_cell.c in_column_group.c in_frameset.c in_head.c in_head_noscript.c initial.c in_row.c in_select.c in_select_in_table.c in_table_body.c in_table.c in_table_text.c in_template.c text.c", "dom");
27+
ADD_SOURCES("ext/dom/lexbor/lexbor/html", "encoding.c interface.c parser.c token.c token_attr.c tokenizer.c tree.c", "dom");
28+
ADD_SOURCES("ext/dom/lexbor/lexbor/encoding", "big5.c decode.c encode.c encoding.c euc_kr.c gb18030.c iso_2022_jp_katakana.c jis0208.c jis0212.c range.c res.c single.c", "dom");
29+
ADD_SOURCES("ext/dom/lexbor/lexbor/html/interfaces", "anchor_element.c area_element.c audio_element.c base_element.c body_element.c br_element.c button_element.c canvas_element.c data_element.c data_list_element.c details_element.c dialog_element.c directory_element.c div_element.c d_list_element.c document.c element.c embed_element.c field_set_element.c font_element.c form_element.c frame_element.c frame_set_element.c head_element.c heading_element.c hr_element.c html_element.c iframe_element.c image_element.c input_element.c label_element.c legend_element.c li_element.c link_element.c map_element.c marquee_element.c media_element.c menu_element.c meta_element.c meter_element.c mod_element.c object_element.c o_list_element.c opt_group_element.c option_element.c output_element.c paragraph_element.c param_element.c picture_element.c pre_element.c progress_element.c quote_element.c script_element.c select_element.c slot_element.c source_element.c span_element.c style_element.c table_caption_element.c table_cell_element.c table_col_element.c table_element.c table_row_element.c table_section_element.c template_element.c text_area_element.c time_element.c title_element.c track_element.c u_list_element.c unknown_element.c video_element.c window.c", "dom");
30+
ADD_SOURCES("ext/dom/lexbor/lexbor/selectors", "selectors.c", "dom");
31+
ADD_SOURCES("ext/dom/lexbor/lexbor/ns", "ns.c", "dom");
32+
ADD_SOURCES("ext/dom/lexbor/lexbor/tag", "tag.c", "dom");
33+
ADD_FLAG("CFLAGS_DOM", "/D LEXBOR_STATIC ");
1834

1935
AC_DEFINE("HAVE_DOM", 1, "DOM support");
2036

0 commit comments

Comments
 (0)