import random
import string
import os

# Destination path of the generated file and its requested size in GiB.
OUTPUT_FILE = "large_test_file_2-5gb.txt"
FILE_SIZE_IN_GB = 2.5

# Byte counts derived from the requested size (binary units: 1 GiB = 1024**3).
TARGET_SIZE_BYTES = int(FILE_SIZE_IN_GB * 1024 ** 3)
CHUNK_SIZE = 1024 ** 2  # upper bound on the size requested per generated chunk

# Shape of the generated text: inclusive word-length bounds and words per line.
MIN_WORD_LEN, MAX_WORD_LEN = 3, 12
WORDS_PER_LINE = 1000

def generate_chunk(target_bytes, *, words_per_line=None, min_word_len=None,
                   max_word_len=None):
    """Return exactly ``target_bytes`` characters of random lowercase words.

    The text consists of lines of space-separated words drawn from
    ``string.ascii_lowercase``. Every character is ASCII, so the character
    count equals the UTF-8 byte count.

    Whole lines are generated until the requested size is reached; any
    overshoot is then trimmed so the result is exactly ``target_bytes`` long
    and still ends with a newline. Because of that trim, the final word of
    the last line may be shorter than ``min_word_len`` and the last line may
    hold fewer than ``words_per_line`` words.

    Args:
        target_bytes: Desired length of the returned text. Values <= 0
            yield an empty string.
        words_per_line: Words per generated line; defaults to
            ``WORDS_PER_LINE``.
        min_word_len: Minimum word length (inclusive); defaults to
            ``MIN_WORD_LEN``.
        max_word_len: Maximum word length (inclusive); defaults to
            ``MAX_WORD_LEN``.

    Returns:
        A ``str`` of length ``target_bytes`` (or ``""`` when
        ``target_bytes <= 0``).
    """
    if target_bytes <= 0:
        return ""

    # Fall back to the module-level settings only for values not supplied.
    if words_per_line is None:
        words_per_line = WORDS_PER_LINE
    if min_word_len is None:
        min_word_len = MIN_WORD_LEN
    if max_word_len is None:
        max_word_len = MAX_WORD_LEN

    alphabet = string.ascii_lowercase
    lines = []
    size = 0

    while size < target_bytes:
        line = " ".join(
            "".join(random.choices(alphabet, k=random.randint(min_word_len, max_word_len)))
            for _ in range(words_per_line)
        ) + "\n"
        lines.append(line)
        size += len(line)

    chunk = "".join(lines)
    if size > target_bytes:
        # The last line ran past the target: cut it back and re-terminate it
        # so callers get the exact size they asked for.
        chunk = chunk[:target_bytes - 1] + "\n"
    return chunk

def create_file():
    """Write ``TARGET_SIZE_BYTES`` of random text to ``OUTPUT_FILE``.

    Text is produced by ``generate_chunk`` in pieces of at most
    ``CHUNK_SIZE`` bytes. A progress line is printed each time another
    100 MiB has been written, and the final on-disk size is printed at the
    end.
    """
    print("Program started!")
    report_step = 100 * 1024 * 1024
    next_report = report_step
    written = 0

    # Binary mode keeps the on-disk size equal to the bytes counted here:
    # text mode would translate "\n" to "\r\n" on Windows. It also lets each
    # chunk be encoded once instead of once for writing and once for counting.
    with open(OUTPUT_FILE, "wb") as f:
        while written < TARGET_SIZE_BYTES:
            remaining = TARGET_SIZE_BYTES - written
            data = generate_chunk(min(CHUNK_SIZE, remaining)).encode("utf-8")

            # Never let a chunk push the file past the target size; keep the
            # trailing newline when trimming.
            if len(data) > remaining:
                data = data[:remaining - 1] + b"\n"

            f.write(data)
            written += len(data)

            # Compare against an explicit threshold so no 100 MiB milestone
            # is skipped or reported twice, whatever the chunk length is.
            if written >= next_report:
                print(f"Written: {written / (1024**3):.2f} GB")
                next_report = (written // report_step + 1) * report_step

    print("Program done!")
    print("Final size:", os.path.getsize(OUTPUT_FILE))

# Only generate the file when executed as a script; importing this module
# (e.g. to reuse generate_chunk) must not write a multi-gigabyte file.
if __name__ == "__main__":
    create_file()
