-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlambda_function.py
More file actions
131 lines (107 loc) · 4.21 KB
/
lambda_function.py
File metadata and controls
131 lines (107 loc) · 4.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import json
import os
import re
import urllib.error
import urllib.request
from datetime import datetime, timezone

import boto3
# Google Docs to sync (document IDs extracted from URLs)
# Each entry is the <ID> segment of a docs.google.com/document/d/<ID>/ URL.
# NOTE(review): the export endpoint used below requires the docs to be
# publicly shared — confirm sharing settings when adding new IDs.
GOOGLE_DOC_IDS = [
    "1juZcZg0ygOesa4LLWOuULUu501BGaBWW3dtvQZQzJr8",
    "1-QCHQaGe44fB4w4OkAwYdcZFW1X8nAjwj2jbNbFf5lg",
    "15kZEyxk1EaEx2yJ-CghO8ZYHcgVCjmG9LXaMAAkbVQQ",
    "1bGJlcbhu7CGs6D6DqP831jL0EnoGQQa_l68m9J1s-20",
]
# S3 bucket name — destination for synced docs; overridable via the
# S3_BUCKET_NAME environment variable.
S3_BUCKET = os.environ.get("S3_BUCKET_NAME", "axilon-rag-mistakes")
def postprocess_content(content: str) -> str:
    """
    Strip metadata lines from exported document text.

    Any line whose stripped form starts with 'at' followed by whitespace
    (case-insensitive) is dropped — these lines typically carry
    timestamps/metadata after item entries. All other lines are kept
    verbatim, in order.
    """
    meta_line = re.compile(r'^at\s', re.IGNORECASE)
    kept = [
        line
        for line in content.split('\n')
        if not meta_line.match(line.strip())
    ]
    return '\n'.join(kept)
def get_google_doc_content(doc_id: str) -> str:
    """
    Fetch the plain text content of a public Google Doc.

    Uses the txt export endpoint, which works for publicly shared
    documents without authentication.

    Args:
        doc_id: Document ID taken from the doc's URL.

    Returns:
        The document body decoded as UTF-8 text.

    Raises:
        Exception: If the HTTP request fails; the original urllib error
            is chained as ``__cause__`` for debugging.
    """
    export_url = f"https://docs.google.com/document/d/{doc_id}/export?format=txt"
    # Built outside the try block: Request() itself does no I/O and
    # cannot raise the urllib errors handled below.
    request = urllib.request.Request(
        export_url,
        headers={"User-Agent": "Mozilla/5.0 (compatible; GDriveSync/1.0)"},
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            return response.read().decode("utf-8")
    except urllib.error.HTTPError as e:
        # HTTPError first — it is a subclass of URLError.
        # Chain the cause so the original traceback/status survives.
        raise Exception(f"Failed to fetch doc {doc_id}: HTTP {e.code} - {e.reason}") from e
    except urllib.error.URLError as e:
        raise Exception(f"Failed to fetch doc {doc_id}: {e.reason}") from e
def upload_to_s3(s3_client, bucket: str, key: str, content: str) -> dict:
    """Upload text content to an S3 bucket as UTF-8.

    Args:
        s3_client: boto3 S3 client (anything exposing ``put_object``).
        bucket: Destination bucket name.
        key: Object key within the bucket.
        content: Text payload; stored UTF-8 encoded.

    Returns:
        dict with ``bucket``, ``key``, and ``size`` — the uploaded size
        in **bytes** (matching what S3 stores), not the character count.
    """
    # Encode once: reused for both the upload body and the reported size.
    # len(content) would under-count any non-ASCII payload.
    body = content.encode("utf-8")
    s3_client.put_object(
        Bucket=bucket,
        Key=key,
        Body=body,
        ContentType="text/plain; charset=utf-8",
    )
    return {"bucket": bucket, "key": key, "size": len(body)}
def lambda_handler(event, context):
    """
    Main Lambda handler.

    Fetches every configured Google Doc, post-processes it, uploads it
    to S3 under ``docs/<doc_id>.txt``, and finally writes a JSON
    manifest of the run to ``docs/_manifest.json``.

    Args:
        event: Lambda invocation payload (unused).
        context: Lambda runtime context (unused).

    Returns:
        dict with ``statusCode`` (200 when every doc synced, 207
        multi-status when any failed) and a JSON ``body`` listing the
        synced documents and any per-document errors.
    """
    s3_client = boto3.client("s3")
    # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
    # since Python 3.12 and returns a naive datetime.
    timestamp = datetime.now(timezone.utc).isoformat()
    results = {
        "timestamp": timestamp,
        "bucket": S3_BUCKET,
        "documents": [],
        "errors": [],
    }
    for doc_id in GOOGLE_DOC_IDS:
        try:
            print(f"Fetching document: {doc_id}")
            content = get_google_doc_content(doc_id)
            # Post-process: remove lines starting with 'at '
            content = postprocess_content(content)
            # Use document ID as filename
            s3_key = f"docs/{doc_id}.txt"
            upload_result = upload_to_s3(s3_client, S3_BUCKET, s3_key, content)
            upload_result["doc_id"] = doc_id
            results["documents"].append(upload_result)
            print(f"Successfully synced {doc_id} -> s3://{S3_BUCKET}/{s3_key}")
        except Exception as e:
            # Best-effort per document: record the failure and keep
            # syncing the remaining docs.
            error_msg = str(e)
            print(f"Error processing {doc_id}: {error_msg}")
            results["errors"].append({"doc_id": doc_id, "error": error_msg})
    # Also save a manifest/metadata file describing this run.
    manifest_key = "docs/_manifest.json"
    manifest_content = json.dumps(results, indent=2)
    upload_to_s3(s3_client, S3_BUCKET, manifest_key, manifest_content)
    return {
        "statusCode": 200 if not results["errors"] else 207,
        "body": json.dumps(results, indent=2),
    }
# For local testing
if __name__ == "__main__":
    # Fetch and post-process each configured doc without uploading to S3.
    for document_id in GOOGLE_DOC_IDS:
        try:
            raw = get_google_doc_content(document_id)
            cleaned = postprocess_content(raw)
        except Exception as exc:
            print(f"Error fetching {document_id}: {exc}")
        else:
            print(f"\n--- Document {document_id} ---")
            print(f"Raw length: {len(raw)} chars, Processed: {len(cleaned)} chars")
            print(f"Preview:\n{cleaned[:500]}...")