tidyHTML/tidyHTML.py at master · kelleyloder/tidyHTML · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
#Tidy HTML
#Eitan Goldberger and Kelley Loder

import os
import random
import sys
import re

def copy_to_new_file():
    '''Back up a user-chosen file and return the backup handle and its lines.

    Calls open_file() to obtain the source file and its name, reads every
    line, and writes the lines unchanged to "<filename>.bak".

    Returns a tuple (new_file, content):
        new_file -- the backup file object, still open; the caller is
                    responsible for closing it.
        content  -- list of the lines read from the source file.
    '''
    source, filename = open_file()
    try:
        content = source.readlines()
    finally:
        # Release the source handle even when reading fails.
        source.close()
    new_file = open(filename + '.bak', 'w')
    new_file.write(''.join(content))
    # The handle is returned open, so flush now: the backup must be on disk
    # even if the caller never closes it.
    new_file.flush()
    return new_file, content

def open_file():
    '''Prompt the user for a file name and open that file for reading.

    Returns a tuple (file object opened in 'r' mode, file name as typed).
    '''
    chosen_name = raw_input("Enter a file to read: ")
    return open(chosen_name, 'r'), chosen_name

def strip_all(content):
    '''Return a new list with spaces and tabs trimmed from both ends of
    every line.

    Only ' ' and tab characters are removed, so a trailing newline (and any
    blanks sitting directly in front of it) is kept.
    '''
    trimmed = []
    for text in content:
        trimmed.append(text.strip(' \t'))
    return trimmed

def make_lower(content):
    '''Lowercase the text of every tag found on each line.

    A tag is everything between a '<' and the next '>' on the same line;
    that whole span (tag name and any attributes) is lowercased. Text
    outside tags is left alone.

    A '<' with no '>' after it on the same line is left untouched. Without
    that guard a line ending in '<' would grow forever, because the missing
    '>' is reported as index -1.

    The list is modified in place and also returned.
    '''
    for line_no, text in enumerate(content):
        pieces = []
        pos = 0
        while True:
            start = text.find('<', pos)
            if start == -1:
                break
            stop = text.find('>', start)
            if stop == -1:
                # Unterminated tag: nothing sensible to lowercase.
                break
            pieces.append(text[pos : start + 1])
            pieces.append(text[start + 1 : stop].lower())
            # Resume at the '>' so it is copied with the following text.
            pos = stop
        pieces.append(text[pos : ])
        content[line_no] = ''.join(pieces)
    return content

def fix_nesting(content):
    '''Repair mis-nested tags and report end tags that have no start tag.

    Walks every tag in document order, keeping a stack of open start tags
    (each recorded as a dict with 'line', 'start_ndx' and 'stop_ndx').

    For an end tag that has a matching start tag somewhere on the stack,
    every start tag opened after that match is closed first: a closing tag
    is inserted directly in front of the end tag, and the entry is popped.
    Popping all of them keeps the stack from holding stale entries that
    would corrupt the handling of later end tags.

    An end tag with no matching start tag on the stack is not modified
    here; its position is appended to the returned list.

    Returns a tuple (content, extra_end_tags). content is modified in place.
    '''
    line = 0
    start_tags = []
    extra_end_tags = []
    while line < len(content):
        ndx = 0
        while '<' in content[line][ndx : ]:
            start = content[line].find('<', ndx)
            # Search from the '<' so a stray '>' in preceding text is not
            # mistaken for the end of this tag.
            stop = content[line].find('>', start)
            if stop == -1:
                # Unterminated tag: nothing more to classify on this line.
                break
            tag_text = content[line][start : stop + 1]
            kind = classify(tag_text)
            if kind == 'start':
                start_tags.append({ 'line': line, 'start_ndx' : start, 'stop_ndx' : stop })
            elif kind == 'end':
                if start_tag_exists(content, line, start, stop, start_tags):
                    # A match is known to be on the stack, so this loop
                    # always ends at the break below.
                    while start_tags:
                        last_start = start_tags.pop()
                        last_start_content = content[last_start['line']][ last_start['start_ndx'] : last_start['stop_ndx'] + 1 ]
                        if tags_match(last_start_content, tag_text):
                            break
                        # Close the unclosed inner tag using its name only;
                        # attributes do not belong in a closing tag.
                        name_parts = last_start_content[1 : -1].split()
                        name = name_parts[0].rstrip('/') if name_parts else ''
                        closing = '</' + name + '>'
                        content[line] = content[line][ : start] + closing + content[line][start : ]
                        # The end tag moved right by the inserted text.
                        start += len(closing)
                        stop += len(closing)
                else:
                    extra_end_tags.append({'line': line, 'start_ndx': start, 'stop_ndx': stop})
            ndx = stop + 1
        line += 1
    return content, extra_end_tags

def classify(str_tag):
    '''Classify a tag as 'start', 'end', or 'empty' and return that string.

    str_tag is the tag text including its angle brackets, e.g. '<p>',
    '</p>' or '<img src="a.png">'.

    'end'   -- the character after '<' is '/'.
    'empty' -- the tag NAME is one of the elements that never take an end
               tag (br, img, hr, ...).
    'start' -- anything else.

    The name is compared exactly rather than searched for as a substring,
    so '<a href="x">' (contains "hr"), '<abbr>' (contains "br") and
    '<colgroup>' (contains "col") are correctly reported as 'start'.
    '''
    empty_names = ('area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
                   'img', 'input', 'isindex', 'link', 'meta', 'param')
    # Slice instead of indexing so a one-character string cannot raise.
    if str_tag[1:2] == '/':
        return 'end'
    parts = str_tag.strip('<>').split()
    # rstrip('/') lets the self-closing spelling '<br/>' resolve to 'br'.
    name = parts[0].rstrip('/').lower() if parts else ''
    if name in empty_names:
        return 'empty'
    return 'start'

def tags_match(start_tag, end_tag):
    '''Return True when start_tag and end_tag name the same element.

    Only the tag names are compared, case-insensitively, so a start tag
    carrying attributes ('<div class="x">') matches its plain end tag
    ('</div>'). Surrounding '<', '>', '/' and whitespace are ignored, which
    also makes the comparison work for tag text passed without its
    trailing '>'.
    '''
    def _name(tag):
        # First whitespace-separated token inside the brackets is the name.
        parts = tag.strip('<>/ \t\r\n').split()
        return parts[0].rstrip('/').lower() if parts else ''

    return _name(start_tag) == _name(end_tag)

def start_tag_exists(content, line, start, stop, start_tags):
    '''Tell whether the end tag at content[line][start:stop + 1] matches
    any start tag recorded in start_tags.

    Each entry of start_tags is a dict with 'line', 'start_ndx' and
    'stop_ndx' locating a start tag inside content. Returns True or False.
    '''
    closing = content[line][start : stop + 1]
    return any(
        tags_match(
            content[entry['line']][entry['start_ndx'] : entry['stop_ndx'] + 1],
            closing)
        for entry in start_tags)

def delete_xtra_end_tags(xtra_end_tags, content):
    '''Cut the listed tags out of content and return content.

    Each entry of xtra_end_tags is a dict with 'line', 'start_ndx' and
    'stop_ndx'; characters start_ndx..stop_ndx (inclusive) of that line are
    removed. Entries are handled last-to-first so that removing a later tag
    does not shift the offsets of earlier tags on the same line.
    '''
    for entry in xtra_end_tags[::-1]:
        row = entry['line']
        text = content[row]
        content[row] = text[ : entry['start_ndx']] + text[entry['stop_ndx'] + 1 : ]
    return content

def give_tags_new_line(content):
    '''Split lines so that unpaired start tags and end tags begin a line.

    For every tag on every line:
      * a start tag whose closing tag appears later on the same line is
        left where it is, and scanning resumes just past the start of that
        closing tag;
      * a start tag with no closing tag later on the line, or an end tag,
        is moved to the beginning of a new line unless it is already at
        column 0;
      * empty tags are skipped.

    The closing tag is searched for AFTER the start tag. Searching from the
    beginning of the line would find the closer of an earlier, identical
    element (as in '<b>x</b><b>y</b>'), move the scan position backwards
    and loop forever.

    content is modified in place (lines are inserted) and returned.
    '''
    line = 0
    while line < len(content):
        ndx = 0
        while '<' in content[line][ndx : ]:
            start = content[line].find('<', ndx)
            stop = content[line].find('>', start)
            if stop == -1:
                # Unterminated tag: nothing left to classify on this line.
                break
            tag_kind = classify(content[line][start : stop + 1])
            if tag_kind == 'start':
                closer = '</' + content[line][start + 1 : stop]
                close_ndx = content[line].find(closer, stop)
                if close_ndx != -1:
                    # Paired on this line: keep it inline, continue after
                    # the '<' of its closing tag.
                    ndx = close_ndx + 1
                elif start != 0:
                    content[line], new_line = content[line][ : start] + '\n', content[line][start : ]
                    content.insert(line + 1, new_line)
                    # The rest of the text now lives on the next line, which
                    # the outer loop visits next.
                    ndx = start - 1
                else:
                    ndx = stop
            elif tag_kind == 'end' and start != 0:
                content[line], new_line = content[line][ : start] + '\n', content[line][start : ]
                content.insert(line + 1, new_line)
                ndx = start - 1
            else:
                # End tag already at column 0, or an empty tag.
                ndx = stop
        line += 1
    return content

def delete_blank_lines(content):
    '''Drop every line that is just a newline once spaces and tabs are
    trimmed from it.

    The list is updated in place and returned.
    '''
    content[:] = [text for text in content if text.strip('\t ') != "\n"]
    return content

def add_blank_lines(content):
    '''Insert a blank line in front of each line holding a <head>, <body>
    or <h1>..<h6> start tag.

    One blank line is inserted per listed tag found on the line, so a line
    containing two of them gets two blank lines. content is modified in
    place and returned.
    '''
    markers = ['<head>', '<body>', '<h1>', '<h2>', '<h3>', '<h4>', '<h5>', '<h6>']
    pos = 0
    while pos < len(content):
        text = content[pos]
        hits = len([marker for marker in markers if marker in text])
        content[pos:pos] = ['\n'] * hits
        # Step over the inserted blanks and the line itself.
        pos += hits + 1
    return content

def find_tags(content):
    '''Find every tag in content and return their positions.

    Returns a list of dicts in document order, one per tag, with keys:
        'line'      -- index of the line holding the tag
        'start_ndx' -- index of the tag's '<' within that line
        'stop_ndx'  -- index of the tag's '>' within that line
        'type'      -- result of classify() for the tag text

    The '>' is searched for starting at the tag's '<', so a stray '>' in
    the text before a tag cannot produce a bogus entry. A '<' with no '>'
    after it on the same line ends the scan of that line instead of
    looping forever.
    '''
    tags = []
    for line, text in enumerate(content):
        ndx = 0
        while True:
            start = text.find('<', ndx)
            if start == -1:
                break
            stop = text.find('>', start)
            if stop == -1:
                break
            tags.append({'start_ndx': start, 'stop_ndx': stop, 'line': line,
                         'type': classify(text[start : stop + 1])})
            ndx = stop + 1
    return tags

def same_line_tags(tags, content):
    '''Remove from tags each start/end pair that opens and closes on the
    same line.

    A pair is two neighbouring entries where the first is a 'start', the
    second is an 'end', both sit on the same line and tags_match() accepts
    their text. After a pair is removed the scan steps back one entry, so
    an enclosing pair on the same line (as in '<p><b>x</b></p>') is
    removed as well.

    Scanning starts at the second entry: the first entry has no
    predecessor, and indexing tags[-1] would wrongly compare it with the
    LAST tag. The compared text includes the closing '>' so that the whole
    tag name takes part in the comparison.

    tags is modified in place and returned.
    '''
    ndx = 1
    while ndx < len(tags):
        prev_tag = tags[ndx - 1]
        cur_tag = tags[ndx]
        is_pair = (prev_tag['type'] == 'start' and cur_tag['type'] == 'end'
                   and prev_tag['line'] == cur_tag['line'])
        if is_pair:
            first_tag_text = content[prev_tag['line']] [prev_tag['start_ndx'] : prev_tag['stop_ndx'] + 1]
            sec_tag_text = content[cur_tag['line']] [cur_tag['start_ndx'] : cur_tag['stop_ndx'] + 1]
            is_pair = tags_match(first_tag_text, sec_tag_text)
        if is_pair:
            tags.pop(ndx)
            tags.pop(ndx - 1)
            # Re-examine the entries that have just become neighbours.
            ndx = max(ndx - 1, 1)
        else:
            ndx += 1
    return tags

def indent_all_tags(tags, content):
    '''Indent the lines enclosed by every start tag in tags.

    For each 'start' entry, the matching end tag is looked up with
    find_end_tag() and the lines strictly between the two tags are
    indented one level deeper than the start tag's own line.

    A start tag for which find_end_tag() returns None (no end tag found)
    is skipped; using None as a list index would raise a TypeError.

    Returns a tuple (content, tags) as handed back by indent_lines().
    '''
    ndx = 0
    while ndx < len(tags):
        if tags[ndx]['type'] == 'start':
            end_ndx = find_end_tag(tags, ndx, content)
            if end_ndx is not None:
                line = tags[ndx]['line']
                end_line = tags[end_ndx]['line']
                content, tags = indent_lines(line + 1, end_line - 1, how_many_indents(content[line]) + 1, content, tags)
        ndx += 1
    return content, tags

def find_end_tag(tags, start_tag_ndx, content):
    '''Return the index in tags of the end tag closing the start tag at
    start_tag_ndx, or None when no matching end tag follows it.

    Nested elements of the same name are taken into account: every later
    start tag with the same name must be closed before the end tag for
    this one is accepted. When the markup is unbalanced and no such end
    tag exists, the first matching end tag after the start tag is returned
    instead.
    '''
    def _text(tag):
        return content[tag['line']][tag['start_ndx'] : tag['stop_ndx'] + 1]

    start_tag_text = _text(tags[start_tag_ndx])
    first_match = None
    depth = 1
    for ndx in range(start_tag_ndx + 1, len(tags)):
        other_text = _text(tags[ndx])
        if tags[ndx]['type'] == 'end':
            if tags_match(start_tag_text, other_text):
                if first_match is None:
                    first_match = ndx
                depth -= 1
                if depth == 0:
                    return ndx
        elif tags[ndx]['type'] == 'start':
            # tags_match() expects an end tag as its second argument, so
            # present the other start tag in end-tag form to compare names.
            if tags_match(start_tag_text, '</' + other_text[1 : ]):
                depth += 1
    return first_match

def indent_lines(start_line, last_line, num_indents, content, tags):
    '''Re-indent content[start_line] .. content[last_line] (inclusive) to
    exactly num_indents tab characters.

    Existing leading tabs and spaces on each line are replaced. Every tag
    entry located on a re-indented line has its 'start_ndx' and 'stop_ndx'
    shifted by the number of characters the line actually gained or lost,
    so the offsets keep pointing at the tag's '<' and '>'.

    Both content and tags are modified in place.
    Returns a tuple (content, tags).
    '''
    prefix = '\t' * num_indents
    for line in range(start_line, last_line + 1):
        old_length = len(content[line])
        content[line] = prefix + content[line].lstrip('\t ')
        shift = len(content[line]) - old_length
        if shift == 0:
            continue
        for tag in tags:
            if tag['line'] == line:
                tag['start_ndx'] += shift
                tag['stop_ndx'] += shift
    return content, tags

def how_many_indents(line):
    '''Count the tab characters at the very beginning of line.'''
    count = 0
    for char in line:
        if char != '\t':
            break
        count += 1
    return count

def line_length(content):
    '''Wrap lines whose length (newline included) exceeds 80 characters.

    An over-long line is cut at the space reported by
    find_first_space_before_80(); the text after that space is placed on a
    new following line that starts with as many tabs as the original line
    had. A line with no usable space is left as it is. The new line is
    examined in turn, so it may be wrapped again.

    content is modified in place and returned.
    '''
    pos = 0
    while pos < len(content):
        text = content[pos]
        if len(text) <= 80:
            pos += 1
            continue
        depth = how_many_indents(text)
        cut = find_first_space_before_80(text)
        if cut == -1:
            # No place to break this line; move on.
            pos += 1
            continue
        content[pos] = text[ : cut] + '\n'
        content.insert(pos + 1, '\t' * depth + text[cut + 1 : ])
    return content

def find_first_space_before_80(line):
    '''Return the index of the last space among the first 79 characters of
    line (indexes 0..78), or -1 when there is none.

    Lines shorter than 79 characters are handled as well; indexing
    position 78 directly would raise an IndexError on them.
    '''
    return line.rfind(' ', 0, 79)

def create_output_file(content):
    '''Write content to a new file named "<random integer>.html".

    The integer is drawn with random.randint(1, sys.maxint) and the file is
    created in the current working directory. content is a list of strings
    that are joined and written as-is.
    '''
    file_name = str(random.randint(1, sys.maxint)) + '.html'
    # The with-block closes the file even if the write fails.
    with open(file_name, 'w') as new_file:
        new_file.write(''.join(content))

def main():
    '''Run the whole tidy pipeline on a file chosen by the user.

    Steps, in order: back up the input, trim the lines, lowercase tags,
    repair nesting and drop stray end tags, put tags on their own lines,
    normalise blank lines, indent nested content, wrap long lines, and
    write the result to a new output file.
    '''
    nf, content = copy_to_new_file()
    # The backup is complete at this point; close it so the handle is not
    # leaked and the data is on disk before any later step can fail.
    nf.close()
    content = strip_all(content)
    content = make_lower(content)
    content, xtra = fix_nesting(content)
    content = delete_xtra_end_tags(xtra, content)
    content = give_tags_new_line(content)
    content = delete_blank_lines(content)
    content = add_blank_lines(content)
    tags = find_tags(content)
    tags = same_line_tags(tags, content)
    content, tags = indent_all_tags(tags, content)
    content = line_length(content)
    create_output_file(content)

# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()