|
2 | 2 | import math |
3 | 3 | import time |
4 | 4 |
|
5 | | -import pytest |
6 | | - |
7 | 5 | log = logging.getLogger(__name__) |
8 | 6 |
|
9 | 7 |
|
10 | | -class LargeQueriesFetchMixin: |
11 | | - """Shared fetch helper for large query test classes.""" |
12 | | - |
13 | | - def fetch_rows(self, cursor, row_count, fetchmany_size): |
14 | | - """ |
15 | | - A generator for rows. Fetches until the end or up to 5 minutes. |
16 | | - """ |
17 | | - # TODO: Remove fetchmany_size when we have fixed the performance issues with fetchone |
18 | | - # in the Python client |
19 | | - max_fetch_time = 5 * 60 # Fetch for at most 5 minutes |
20 | | - |
21 | | - rows = self.get_some_rows(cursor, fetchmany_size) |
22 | | - start_time = time.time() |
23 | | - n = 0 |
24 | | - while rows: |
25 | | - for row in rows: |
26 | | - n += 1 |
27 | | - yield row |
28 | | - if time.time() - start_time >= max_fetch_time: |
29 | | - log.warning("Fetching rows timed out") |
30 | | - break |
31 | | - rows = self.get_some_rows(cursor, fetchmany_size) |
32 | | - if not rows: |
33 | | - # Read all the rows, row_count should match |
34 | | - self.assertEqual(n, row_count) |
35 | | - |
36 | | - num_fetches = max(math.ceil(n / 10000), 1) |
37 | | - latency_ms = int((time.time() - start_time) * 1000 / num_fetches), 1 |
38 | | - print( |
39 | | - "Fetched {} rows with an avg latency of {} per fetch, ".format( |
40 | | - n, latency_ms |
41 | | - ) |
42 | | - + "assuming 10K fetch size." |
def fetch_rows(test_case, cursor, row_count, fetchmany_size):
    """
    A generator for rows. Fetches until the end or up to 5 minutes.

    Yields each row from ``cursor`` in order, pulling batches via
    ``_get_some_rows``. If the cursor was fully drained (as opposed to
    hitting the time limit), asserts through ``test_case`` that exactly
    ``row_count`` rows were read, then prints a rough per-fetch latency.

    :param test_case: unittest.TestCase-like object providing assertEqual.
    :param cursor: cursor with a pending result set to fetch from.
    :param row_count: expected total number of rows (checked on full read).
    :param fetchmany_size: batch size forwarded to ``_get_some_rows``.
    """
    max_fetch_time = 5 * 60  # Fetch for at most 5 minutes

    rows = _get_some_rows(cursor, fetchmany_size)
    start_time = time.time()
    n = 0
    while rows:
        for row in rows:
            n += 1
            yield row
        if time.time() - start_time >= max_fetch_time:
            log.warning("Fetching rows timed out")
            break
        rows = _get_some_rows(cursor, fetchmany_size)
    if not rows:
        # Read all the rows, row_count should match
        test_case.assertEqual(n, row_count)

    num_fetches = max(math.ceil(n / 10000), 1)
    # BUG FIX: the original `int(...), 1` created a (latency, 1) tuple.
    # Mirroring the max(..., 1) floor used for num_fetches above, the
    # intent is presumably a 1 ms floor on the reported latency.
    latency_ms = max(int((time.time() - start_time) * 1000 / num_fetches), 1)
    print(
        "Fetched {} rows with an avg latency of {} per fetch, ".format(
            n, latency_ms
        )
        + "assuming 10K fetch size."
    )
56 | | - @pytest.mark.parametrize("lz4_compression", [False, True]) |
57 | | - def test_query_with_large_wide_result_set(self, extra_params, lz4_compression): |
58 | | - resultSize = 100 * 1000 * 1000 # 100 MB |
59 | | - width = 8192 # B |
60 | | - rows = resultSize // width |
61 | | - cols = width // 36 |
62 | | - |
63 | | - # Set the fetchmany_size to get 10MB of data a go |
64 | | - fetchmany_size = 10 * 1024 * 1024 // width |
65 | | - # This is used by PyHive tests to determine the buffer size |
66 | | - self.arraysize = 1000 |
67 | | - with self.cursor(extra_params) as cursor: |
68 | | - cursor.connection.lz4_compression = lz4_compression |
69 | | - uuids = ", ".join(["uuid() uuid{}".format(i) for i in range(cols)]) |
70 | | - cursor.execute( |
71 | | - "SELECT id, {uuids} FROM RANGE({rows})".format( |
72 | | - uuids=uuids, rows=rows |
73 | | - ) |
74 | | - ) |
75 | | - assert lz4_compression == cursor.active_result_set.lz4_compressed |
76 | | - for row_id, row in enumerate( |
77 | | - self.fetch_rows(cursor, rows, fetchmany_size) |
78 | | - ): |
79 | | - assert row[0] == row_id # Verify no rows are dropped in the middle. |
80 | | - assert len(row[1]) == 36 |
81 | | - |
82 | | - |
83 | | -class LargeNarrowResultSetMixin(LargeQueriesFetchMixin): |
84 | | - """Test mixin for large narrow result set queries.""" |
85 | | - |
86 | | - @pytest.mark.parametrize( |
87 | | - "extra_params", |
88 | | - [ |
89 | | - {}, |
90 | | - {"use_sea": True}, |
91 | | - ], |
92 | | - ) |
93 | | - def test_query_with_large_narrow_result_set(self, extra_params): |
94 | | - resultSize = 100 * 1000 * 1000 # 100 MB |
95 | | - width = 8 # sizeof(long) |
96 | | - rows = resultSize / width |
97 | | - |
98 | | - # Set the fetchmany_size to get 10MB of data a go |
99 | | - fetchmany_size = 10 * 1024 * 1024 // width |
100 | | - # This is used by PyHive tests to determine the buffer size |
101 | | - self.arraysize = 10000000 |
102 | | - with self.cursor(extra_params) as cursor: |
103 | | - cursor.execute("SELECT * FROM RANGE({rows})".format(rows=rows)) |
104 | | - for row_id, row in enumerate(self.fetch_rows(cursor, rows, fetchmany_size)): |
105 | | - assert row[0] == row_id |
106 | | - |
107 | | - |
108 | | -class LongRunningQueryMixin: |
109 | | - """Test mixin for long running queries.""" |
110 | | - |
111 | | - @pytest.mark.parametrize( |
112 | | - "extra_params", |
113 | | - [ |
114 | | - {}, |
115 | | - {"use_sea": True}, |
116 | | - ], |
117 | | - ) |
118 | | - def test_long_running_query(self, extra_params): |
119 | | - """Incrementally increase query size until it takes at least 3 minutes, |
120 | | - and asserts that the query completes successfully. |
121 | | - """ |
122 | | - minutes = 60 |
123 | | - min_duration = 1 * minutes |
124 | | - |
125 | | - duration = -1 |
126 | | - scale0 = 10000 |
127 | | - scale_factor = 50 |
128 | | - with self.cursor(extra_params) as cursor: |
129 | | - while duration < min_duration: |
130 | | - assert scale_factor < 4096, "Detected infinite loop" |
131 | | - start = time.time() |
132 | | - |
133 | | - cursor.execute( |
134 | | - """SELECT count(*) |
135 | | - FROM RANGE({scale}) x |
136 | | - JOIN RANGE({scale0}) y |
137 | | - ON from_unixtime(x.id * y.id, "yyyy-MM-dd") LIKE "%not%a%date%" |
138 | | - """.format( |
139 | | - scale=scale_factor * scale0, scale0=scale0 |
140 | | - ) |
141 | | - ) |
142 | | - |
143 | | - (n,) = cursor.fetchone() |
144 | | - assert n == 0 |
145 | | - |
146 | | - duration = time.time() - start |
147 | | - current_fraction = duration / min_duration |
148 | | - print("Took {} s with scale factor={}".format(duration, scale_factor)) |
149 | | - # Extrapolate linearly to reach 3 min and add 50% padding to push over the limit |
150 | | - scale_factor = math.ceil(1.5 * scale_factor / current_fraction) |
151 | 37 |
|
152 | 38 |
|
153 | | -# Keep backward-compatible alias that combines all three |
154 | | -class LargeQueriesMixin(LargeWideResultSetMixin, LargeNarrowResultSetMixin, LongRunningQueryMixin): |
155 | | - pass |
| 39 | +def _get_some_rows(cursor, fetchmany_size): |
| 40 | + row = cursor.fetchone() |
| 41 | + if row: |
| 42 | + return [row] |
| 43 | + else: |
| 44 | + return None |
0 commit comments