-
Notifications
You must be signed in to change notification settings - Fork 18
/
curl-output-html-parser.py
35 lines (32 loc) · 1.22 KB
/
curl-output-html-parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from lxml import html, etree
import sys
"""
Read HTML from stdin,
parse it as a complete HTML document,
and print the text of the first node matched by the XPath expression given as the first command-line argument.
usage example
```
curl -X GET google.com | python3 curl-output-html-parser.py "/html/head/title"
curl -X GET google.com | python3 curl-output-html-parser.py "/html/body/a/@href"
curl --cookie cookie.txt --silent "http://loveread.books/read_book.php?id=66258&p=100" | iconv --from-code WINDOWS-1251 --to-code UTF-8 | python3 curl-output-html-parser.py "/html/body/table/tr[2]/td/table/tr/td[2]/div[3]"
```
"""
def _render(item):
    """Return the printable text for one XPath result item.

    Items exposing ``itertext`` (elements) are flattened to their text
    fragments, each stripped of surrounding whitespace and placed on its own
    line. Any other item (for example an attribute value) is converted with
    ``str``.
    """
    if hasattr(item, "itertext"):
        return "\n".join(fragment.strip() for fragment in item.itertext())
    return str(item)


def main(argv=None):
    """Parse HTML from stdin and print the first match of an XPath query.

    Args:
        argv: Argument list laid out like ``sys.argv`` (script name first,
            XPath expression second). Defaults to ``sys.argv``. When no
            expression is supplied, ``/html`` is used.

    Returns:
        The process exit status: 0 when a result was printed, 1 when the
        input could not be parsed or the query matched nothing.
    """
    argv = sys.argv if argv is None else argv
    # argv[0] is always the script name, so a query is present only when there
    # are at least two entries; testing for a length of 0 can never succeed.
    query = argv[1] if len(argv) > 1 else "/html"

    # Read stdin verbatim. Iterating over lines and re-joining them with "\n"
    # would double every line break, because each line keeps its own newline.
    markup = sys.stdin.read()
    try:
        tree = html.fromstring(markup)
    except etree.ParserError as error:
        # Raised by lxml when the document is empty; report it briefly
        # instead of dumping a traceback.
        print("cannot parse input:", error, file=sys.stderr)
        return 1

    result = tree.xpath(query)
    if not isinstance(result, list):
        # Expressions such as count(...) or string(...) yield a single
        # number/boolean/string rather than a list of nodes; print it whole.
        print(result)
        return 0
    if not result:
        return 1

    print(_render(result[0]))
    return 0


if __name__ == "__main__":
    sys.exit(main())