-
Notifications
You must be signed in to change notification settings - Fork 0
/
torrent.py
464 lines (357 loc) · 13.8 KB
/
torrent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
#!/usr/bin/env python3
"""
Copyright (C) 2010-2013 Robert Nitsch
Licensed according to GPL v3.
"""
import datetime
import hashlib
import math
import optparse
import os
import re
import sys
import time
from py3bencode import bencode
# Public API of this module when imported via "from torrent import *".
__all__ = ['calculate_piece_length',
           'get_files_in_directory',
           'sha1_20',
           'split_path']

# #############
# CONFIGURATION
# do not touch anything below this line unless you know what you're doing!
VERSION = '0.9.5'

# Note:
# Kilobyte = kB = 1000 Bytes
# Kibibyte = KiB = 1024 Bytes << used by py3createtorrent
KIB = 2**10        # bytes per kibibyte
MIB = KIB * KIB    # bytes per mebibyte
def sha1_20(data):
    """Return the first 20 bytes of the SHA-1 digest of *data*.

    (SHA-1 digests are exactly 20 bytes, so this is the full digest;
    the slice merely makes the contract explicit.)
    """
    return hashlib.sha1(data).digest()[:20]
def create_single_file_info(file, piece_length, include_md5=True):
    """
    Return dictionary with the following keys:
      - pieces: concatenated 20-byte-sha1-hashes
      - name: basename of the file
      - length: size of the file in bytes
      - md5sum: md5sum of the file (unless disabled via include_md5)

    @see: BitTorrent Metainfo Specification.
    @note: md5 hashes in torrents are actually optional
    """
    assert os.path.isfile(file), "not a file"

    # Total number of bytes read from the file.
    total_length = 0

    # Concatenated 20-byte SHA-1 hashes of all the file's pieces.
    piece_hashes = bytearray()

    md5_hash = hashlib.md5() if include_md5 else None

    with open(file, "rb") as stream:
        # iter() with a b"" sentinel yields piece_length-sized chunks
        # until end of file.
        for chunk in iter(lambda: stream.read(piece_length), b""):
            total_length += len(chunk)
            piece_hashes += sha1_20(chunk)
            if md5_hash is not None:
                md5_hash.update(chunk)

    assert total_length > 0, "empty file"

    info = {
        'pieces': piece_hashes,
        'name': os.path.basename(file),
        'length': total_length,
    }

    if md5_hash is not None:
        info['md5sum'] = md5_hash.hexdigest()

    return info
def create_multi_file_info(directory,
                           files,
                           piece_length,
                           include_md5=True):
    """
    Return dictionary with the following keys:
      - pieces: concatenated 20-byte-sha1-hashes
      - name: basename of the directory (default name of all torrents)
      - files: a list of dictionaries with the following keys:
        - length: size of the file in bytes
        - md5sum: md5 sum of the file (unless disabled via include_md5)
        - path: path to the file, relative to the initial directory,
                given as list.
                Examples:
                -> ["dir1", "dir2", "file.ext"]
                -> ["just_in_the_initial_directory_itself.ext"]

    @see: BitTorrent Metainfo Specification.
    @note: md5 hashes in torrents are actually optional
    """
    assert os.path.isdir(directory), "not a directory"

    # Concatenated 20byte sha1-hashes of all the torrent's pieces.
    info_pieces = bytearray()

    # Per-file metadata dictionaries for the 'files' key.
    info_files = []

    # This bytearray will be used for the calculation of info_pieces.
    # At some point, every file's data will be written into data. Consecutive
    # files will be written into data as a continuous stream, as required
    # by info_pieces' BitTorrent specification.
    data = bytearray()

    for file in files:
        path = os.path.join(directory, file)

        # File's byte count.
        length = 0

        # File's md5sum.
        md5 = hashlib.md5() if include_md5 else None

        with open(path, "rb") as fh:
            while True:
                filedata = fh.read(piece_length)

                if len(filedata) == 0:
                    break

                length += len(filedata)

                data += filedata

                # Each read is at most piece_length bytes, so at most one
                # complete piece can be pending in data; hash and drop it.
                if len(data) >= piece_length:
                    info_pieces += sha1_20(data[:piece_length])
                    data = data[piece_length:]

                if include_md5:
                    md5.update(filedata)

        # Build the current file's dictionary.
        fdict = {
            'length': length,
            'path': split_path(file)
        }

        if include_md5:
            fdict['md5sum'] = md5.hexdigest()

        info_files.append(fdict)

    # Don't forget to hash the last piece.
    # (Probably the piece that has not reached the regular piece size.)
    if len(data) > 0:
        info_pieces += sha1_20(data)

    # Build the final dictionary.
    info = {
        'pieces': info_pieces,
        'name': os.path.basename(directory.strip("/\\")),
        'files': info_files
    }

    return info
def get_files_in_directory(directory,
                           excluded_paths=frozenset(),
                           relative_to=None,
                           excluded_regexps=frozenset()):
    """
    Return a list containing the paths to all files in the given directory.

    Paths in excluded_paths are skipped. These should be os.path.normcase()-d.
    Of course, the initial directory cannot be excluded.

    Paths matching any of the regular expressions in excluded_regexps are
    skipped, too. The regexps must be compiled by the caller.

    In both cases, absolute paths are used for matching.

    The paths may be returned relative to a specific directory. By default,
    this is the initial directory itself.

    Please note: Only paths to files are returned!

    @param directory: the directory to walk (str).
    @param excluded_paths: set/frozenset of normcased absolute paths to skip.
    @param relative_to: directory the returned paths are made relative to;
                        defaults to the initial directory itself.
    @param excluded_regexps: A set or frozenset of compiled regular expressions.
    @raise TypeError: if an argument has the wrong type.
    @raise ValueError: if relative_to is not an existing directory.
    """
    # Argument validation:
    if not isinstance(directory, str):
        raise TypeError("directory must be instance of: str")

    # Note: defaults are frozensets (immutable) instead of the former
    # mutable set() defaults, which are a well-known Python pitfall.
    if not isinstance(excluded_paths, (set, frozenset)):
        raise TypeError("excluded_paths must be instance of: set or frozenset")

    if relative_to is not None:
        if not isinstance(relative_to, str):
            raise TypeError("relative_to must be instance of: str")

        if not os.path.isdir(relative_to):
            raise ValueError("relative_to: '%s' is not a valid directory" %
                             (relative_to))

    if not isinstance(excluded_regexps, (set, frozenset)):
        raise TypeError("excluded_regexps must be instance of: set or frozenset")

    # Recursive helper. processed_paths collects the normcased realpaths of
    # everything already visited, so symlinks pointing back into the tree
    # are skipped instead of followed forever.
    def _walk(current_directory, files, processed_paths):
        # Sort case-insensitively to improve consistency across platforms.
        entries = os.listdir(current_directory)
        entries.sort(key=str.lower)

        processed_paths.add(
            os.path.normcase(os.path.realpath(current_directory)))

        for node in entries:
            path = os.path.join(current_directory, node)

            if os.path.normcase(path) in excluded_paths:
                continue

            if any(regexp.search(path) for regexp in excluded_regexps):
                continue

            real_path = os.path.normcase(os.path.realpath(path))
            if real_path in processed_paths:
                print("Warning: skipping symlink '%s', because its target "
                      "has already been processed." % path, file=sys.stderr)
                continue
            processed_paths.add(real_path)

            if os.path.isfile(path):
                if relative_to:
                    path = os.path.relpath(path, relative_to)
                files.append(path)
            elif os.path.isdir(path):
                _walk(path, files, processed_paths)
            else:
                assert False, "not a valid node: '%s'" % node

        return files

    # Final preparations:
    directory = os.path.abspath(directory)
    if not relative_to:
        relative_to = directory

    # Now do the main work.
    return _walk(directory, [], set())
def split_path(path):
    """
    Return a list containing all of a path's components.

    Paths containing relative components get resolved first
    (via os.path.normpath).

    Example (POSIX):
        split_path("this/./is/a/very/../nested/file.ext")
        -> ['this', 'is', 'a', 'nested', 'file.ext']

    For absolute paths, the root separator itself is not included in the
    result (e.g. "/a/b" -> ['a', 'b']).

    @raise TypeError: if path is not a str.
    """
    if not isinstance(path, str):
        raise TypeError("path must be instance of: str")

    parts = []
    head = os.path.normpath(path)
    while head:
        head, tail = os.path.split(head)
        if not tail:
            # head is a root (e.g. "/"): os.path.split would return it
            # unchanged forever, which made the original loop spin
            # infinitely on absolute paths. Record a non-separator root
            # (e.g. a drive prefix) and stop.
            if head and head != os.sep:
                parts.insert(0, head)
            break
        parts.insert(0, tail)
    return parts
def remove_duplicates(old_list):
    """
    Return a copy of old_list with duplicates removed.

    The order of the remaining elements is preserved: of all duplicate
    entries, only the first occurrence is kept.
    """
    seen = set()
    result = []
    for element in old_list:
        if element not in seen:
            seen.add(element)
            result.append(element)
    return result
def replace_in_list(old_list, replacements):
    """
    Replace specific items in a list.

    Note that one element may be replaced by multiple new elements.
    However, this also makes it impossible to replace an item with a
    list.

    Example given:
    >>> replace_in_list(['dont',
    ...                  'replace',
    ...                  'us',
    ...                  'replace me'],
    ...                 {'replace me': ['you',
    ...                                 'are',
    ...                                 'welcome']})
    ['dont', 'replace', 'us', 'you', 'are', 'welcome']
    """
    result = []
    for element in old_list:
        if element not in replacements:
            result.append(element)
            continue
        replacement = replacements[element]
        # A list replacement is spliced in; anything else replaces 1:1.
        if isinstance(replacement, list):
            result.extend(replacement)
        else:
            result.append(replacement)
    return result
def calculate_piece_length(size):
    """
    Calculate a reasonable piece length for the given torrent size.

    Proceeding:
    1. Start with 256 KiB.
    2. While piece count > 2000: double the piece length.
    3. While piece count < 8: halve the piece length.

    However, enforce these bounds:
    - minimum piece length = 16 KiB.
    - maximum piece length = 1 MiB.

    @param size: total torrent size in bytes (int, > 0).
    @return: piece length in bytes (int).
    @raise TypeError: if size is not an int.
    @raise ValueError: if size is not positive.
    """
    if not isinstance(size, int):
        raise TypeError("size must be instance of: int")

    if size <= 0:
        raise ValueError("size must be greater than 0 (given: %d)" % size)

    if size < 16 * KIB:
        return 16 * KIB

    piece_length = 256 * KIB

    # Pure integer comparisons instead of the former float divisions
    # (size / piece_length): exact for arbitrarily large sizes, and
    # piece_length stays an int throughout (no trailing int() cast).
    while size > 2000 * piece_length:
        piece_length *= 2

    while size < 8 * piece_length:
        piece_length //= 2

    # Ensure that: 16 KIB <= piece_length <= 1 * MIB
    return max(min(piece_length, 1 * MIB), 16 * KIB)
def make_torrent(node):
    """
    Create a .torrent file for the given file or directory.

    The metainfo's 'info' dict is built via create_single_file_info()
    or create_multi_file_info(), the piece length is chosen with
    calculate_piece_length(), and the bencoded result is written to
    "<name>.torrent" in the current working directory.

    @param node: path to a file or directory.
    @return: the output path (str) on success, or 1 if the torrent file
             could not be written (original error-code convention kept
             for backward compatibility).
    @raise Exception: if node is neither file nor directory, or the
                      torrent would contain 0 bytes of data.
    """
    node = os.path.abspath(node)

    # Validate the given path.
    if not os.path.isfile(node) and not os.path.isdir(node):
        raise Exception("'%s' neither is a file nor a directory." % node)

    # Get the torrent's files and / or calculate its size.
    if os.path.isfile(node):
        torrent_size = os.path.getsize(node)
    else:
        torrent_files = get_files_in_directory(node)
        torrent_size = sum(os.path.getsize(os.path.join(node, file))
                           for file in torrent_files)

    # Torrents for 0 byte data can't be created.
    if torrent_size == 0:
        raise Exception("No data for torrent.")

    piece_length = calculate_piece_length(torrent_size)

    # Prepare the metainfo dictionary: build the "info" sub-dict first.
    if os.path.isfile(node):
        info = create_single_file_info(node, piece_length)
    else:
        info = create_multi_file_info(node, torrent_files, piece_length)

    info['piece length'] = piece_length

    # Construct the outer metainfo dict, which contains the torrent's
    # whole information.
    metainfo = {
        'info': info,
        'announce': 'http://academictorrents.com/announce.php',
        'creation date': int(time.time()),
        'created by': '',
    }

    # Bencode the metainfo dictionary and write the torrent file into
    # the current directory.
    output_path = metainfo['info']['name'] + ".torrent"

    try:
        with open(output_path, "wb") as fh:
            fh.write(bencode(metainfo))
    except IOError as exc:
        print("IOError: " + str(exc), file=sys.stderr)
        print("Could not write the torrent file. Check torrent name and your "
              "privileges.", file=sys.stderr)
        return 1
    except KeyboardInterrupt:
        # Remove the half-written torrent file, then re-raise. The
        # original fell through and returned the path of the file it had
        # just deleted, silently swallowing the interrupt.
        if os.path.exists(output_path):
            os.remove(output_path)
        raise

    return output_path