"""A parser for HTML and XHTML.""" 

 

# This file is based on sgmllib.py, but the API is slightly different. 

 

# XXX There should be a way to distinguish between PCDATA (parsed 

# character data -- the normal case), RCDATA (replaceable character 

# data -- only char and entity references and end tags are special) 

# and CDATA (character data -- only end tags are special). 

 

 

import re 

import warnings 

import _markupbase 

 

from html import unescape 

 

 

__all__ = ['HTMLParser'] 

 

# Regular expressions used for parsing 

 

interesting_normal = re.compile('[&<]') 

incomplete = re.compile('&[a-zA-Z#]') 

 

entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') 

charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') 

 

starttagopen = re.compile('<[a-zA-Z]') 

piclose = re.compile('>') 

commentclose = re.compile(r'--\s*>') 

# Note: 

# 1) if you change tagfind/attrfind remember to update locatestarttagend too; 

# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will 

# explode, so don't do it. 

# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state 

# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state 

tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*') 

attrfind_tolerant = re.compile( 

r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' 

r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') 

locatestarttagend_tolerant = re.compile(r""" 

<[a-zA-Z][^\t\n\r\f />\x00]* # tag name 

(?:[\s/]* # optional whitespace before attribute name 

(?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name 

(?:\s*=+\s* # value indicator 

(?:'[^']*' # LITA-enclosed value 

|"[^"]*" # LIT-enclosed value 

|(?!['"])[^>\s]* # bare value 

) 

(?:\s*,)* # possibly followed by a comma 

)?(?:\s|/(?!>))* 

)* 

)? 

\s* # trailing whitespace 

""", re.VERBOSE) 

endendtag = re.compile('>') 

# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between 

# </ and the tag name, so maybe this should be fixed 

endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') 
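
# Illustrative sanity check (an editor's addition, not part of the original
# module): a minimal sketch of how the two reference regexes above behave.
# Both patterns require a trailing non-name character, which is why each
# sample ends with ';'.
if __debug__:
    assert entityref.match('&amp;').group(1) == 'amp'   # named reference
    assert charref.match('&#x3E;').group() == '&#x3E;'  # numeric reference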

 

 

 

class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode characters (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    """

    CDATA_CONTENT_ELEMENTS = ("script", "style")
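
    # Illustrative note (an editor's addition): with the default
    # convert_charrefs=True, feeding '<p>a &gt; b</p>' reports the text as
    # the single chunk 'a > b' via handle_data(); with convert_charrefs=False
    # the same input is reported as 'a ', handle_entityref('gt'), then ' b'.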

 

    def __init__(self, *, convert_charrefs=True):
        """Initialize and reset this instance.

        If convert_charrefs is True (the default), all character references
        are automatically converted to the corresponding Unicode characters.
        """
        self.convert_charrefs = convert_charrefs
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        # Inside <script>/<style> only the matching end tag is special;
        # switch 'interesting' so everything else is treated as raw data.
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.convert_charrefs and not self.cdata_elem:
                j = rawdata.find('<', i)
                if j < 0:
                    # if we can't find the next <, either we are at the end
                    # or there's more text incoming.  If the latter is True,
                    # we can't pass the text to handle_data in case we have
                    # a charref cut in half at end.  Try to determine if
                    # this is the case before proceeding by looking for an
                    # & near the end and see if it's followed by a space or ;.
                    # (The 34-character window appears to be sized for the
                    # longest HTML5 named references, e.g.
                    # '&CounterClockwiseContourIntegral;' at 33 characters.)
                    amppos = rawdata.rfind('&', max(i, n-34))
                    if (amppos >= 0 and
                        not re.compile(r'[\s;]').search(rawdata, amppos)):
                        break  # wait till we get all the text
                    j = n
            else:
                match = self.interesting.search(rawdata, i)  # < or &
                if match:
                    j = match.start()
                else:
                    if self.cdata_elem:
                        break
                    j = n
            if i < j:
                if self.convert_charrefs and not self.cdata_elem:
                    self.handle_data(unescape(rawdata[i:j]))
                else:
                    self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n: break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    if not end:
                        break
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    if self.convert_charrefs and not self.cdata_elem:
                        self.handle_data(unescape(rawdata[i:k]))
                    else:
                        self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming &#
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        k = match.end()
                        if k <= i:
                            k = n
                        i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            if self.convert_charrefs and not self.cdata_elem:
                self.handle_data(unescape(rawdata[i:n]))
            else:
                self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            next = rawdata[j:j+1]
            if next == ">":
                return j + 1
            if next == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                if j > i:
                    return j
                else:
                    return i + 1
            if next == "":
                # end of input
                return -1
            if next in ("abcdefghijklmnopqrstuvwxyz=/"
                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem.lower())
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        pass

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        warnings.warn('The unescape method is deprecated and will be removed '
                      'in 3.5, use html.unescape() instead.',
                      DeprecationWarning, stacklevel=2)
        return unescape(s)
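

# Demo (an editor's addition, not part of the original module): a minimal
# sketch of the subclass-and-feed usage described in the class docstring.
# The subclass name and the sample markup are invented for illustration.
if __name__ == '__main__':
    class _DemoParser(HTMLParser):
        def handle_starttag(self, tag, attrs):
            print('start:', tag, attrs)

        def handle_endtag(self, tag):
            print('end:', tag)

        def handle_data(self, data):
            print('data:', repr(data))

    p = _DemoParser()
    # feed() accepts arbitrary chunks: the '&amp;' reference split across
    # the two calls below is buffered by goahead() until it is complete.
    p.feed('<p class="intro">Hello &amp')
    p.feed('; world</p>')
    # script/style content is CDATA: the '<' inside is plain data, and only
    # the matching end tag leaves CDATA mode.
    p.feed('<script>if (a < b) f();</script>')
    p.close()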