Implement parser for Breath 4 leaderboards
Krzysztof Andrzej Sikorski

Krzysztof Andrzej Sikorski committed on 2022-04-24 23:05:30
Showing 4 changed files, with 325 additions and 0 deletions.

... ...
@@ -1,6 +1,9 @@
1 1
 # essential framework settings
2 2
 APP_ENV=dev
3 3
 APP_SECRET=
4
+APP_WORKER_PARSER_BATCH_SIZE=1
5
+APP_WORKER_PARSER_MAX_ITERATIONS=1
6
+APP_WORKER_PARSER_MAX_DURATION="1 second"
4 7
 
5 8
 # doctrine settings
6 9
 DATABASE_URL="postgresql://symfony:ChangeMe@127.0.0.1:5432/app?serverVersion=13&charset=utf8"
... ...
@@ -0,0 +1,11 @@
1
+<?php
2
+
3
+declare(strict_types=1);
4
+
5
+namespace App\Contract\Service\Parser;
6
+
7
+use RuntimeException;
8
+
9
/**
 * Exception type thrown by parsers when a stored page view cannot be parsed.
 *
 * Adds no behaviour of its own; it exists so callers can catch parsing
 * failures separately from other runtime exceptions.
 */
class ParserError extends RuntimeException
{
}
... ...
@@ -0,0 +1,282 @@
1
+<?php
2
+
3
+declare(strict_types=1);
4
+
5
+namespace App\Service\Parser;
6
+
7
+use App\Contract\Entity\LeaderboardTypes;
8
+use App\Contract\Entity\Nexus\GamePeriodIdEnum;
9
+use App\Contract\Entity\Nexus\LeaderboardInterface;
10
+use App\Contract\Service\Parser\ParserError;
11
+use App\Contract\Service\Parser\ParserInterface;
12
+use App\Contract\Service\Parser\ParserResultInterface;
13
+use App\Doctrine\Entity\Nexus\GamePeriod;
14
+use App\Doctrine\Entity\PageView;
15
+use App\DTO\Nexus\Leaderboard;
16
+use App\DTO\Nexus\Leaderboard\Entry;
17
+use App\DTO\ParserResult;
18
+use App\Service\Repository\Nexus\GamePeriodRepository;
19
+use DateTimeInterface;
20
+use Symfony\Component\DomCrawler\Crawler;
21
+
22
+use function array_key_exists;
23
+use function iconv;
24
+use function in_array;
25
+use function intval;
26
+use function is_string;
27
+use function mb_convert_case;
28
+use function mt_rand;
29
+use function parse_str;
30
+use function parse_url;
31
+use function preg_match;
32
+use function sprintf;
33
+use function substr;
34
+
35
+use const MB_CASE_LOWER;
36
+use const PHP_URL_PATH;
37
+use const PHP_URL_QUERY;
38
+
39
/**
 * Parser for Breath 4 page views that extracts the leaderboard shown after a newspaper item is used.
 *
 * Only requests to "/modules.php?name=Game&op=useitem" are inspected; every other page view
 * yields a result that carries the game period but no leaderboard.
 */
final class BreathFourthParser implements ParserInterface
{
    private const EXPECTED_PATH = '/modules.php';
    private const MODULE_ARG_NAME = 'name';
    private const MODULE_ARG_VALUE_GAME = 'Game';
    private const ACTION_ARG_NAME = 'op';
    private const ACTION_ARG_VALUE_USE_ITEM = 'useitem';

    private const REGEXP_META_CHARSET_TAG = '/<META[^<>]+(?P<attribute>charset=[^\s"\']+)[^<>]*>/i';
    private const REGEXP_META_CHARSET_ATTRIBUTE = '/^charset=(?P<value>.+)/i';
    private const REGEXP_MESSAGES_NEWSPAPER_USE = '/^\\s*-\\s+(\(\d+\\s+times\)\\s*)?You read the newspaper/i';
    private const REGEXP_LEADERBOARD_TABLE_HEADER = '/^(?P<name>.+)\s+\((?P<type>[^()]+)\)$/i';
    private const REGEXP_LEADERBOARD_ENTRY_HEADER = '/^(?P<position>\d+)\.\)\\s*(?P<characterName>.+)$/';

    // texts of the "#Errors" section that mean the item use failed and no leaderboard is shown
    private const KNOWN_ITEM_USE_ERRORS = [
        'You do not have enough action points to act.',
        'Sorry, you do not own this item!',
    ];

    private GamePeriod $gamePeriod;
    private DateTimeInterface $breathEnd;

    /**
     * Loads the Breath 4 game period and remembers its completion time.
     *
     * Note that the repository lookup happens during construction.
     *
     * @throws \RuntimeException when the Breath 4 game period cannot be found
     */
    public function __construct(GamePeriodRepository $gamePeriodRepository)
    {
        $gamePeriod = $gamePeriodRepository->findById(id: GamePeriodIdEnum::BREATH_4);
        if (null === $gamePeriod) {
            throw new \RuntimeException(message: 'Could not find B4 game period in database!');
        }
        $this->gamePeriod = $gamePeriod;
        $this->breathEnd = $this->gamePeriod->getCompletedAt();
    }

    /**
     * Tells whether the page view was requested no later than the completion time of Breath 4.
     */
    public function supports(PageView $pageView): bool
    {
        return $this->breathEnd >= $pageView->getRequestStartedAt();
    }

    /**
     * Parses a single page view.
     *
     * The returned result always carries the Breath 4 game period. A leaderboard is attached only
     * when the page view is an item use whose response contains a newspaper leaderboard.
     *
     * @throws ParserError when the stored body is empty, its charset cannot be determined,
     *                     it cannot be converted to UTF-8 or the leaderboard markup is malformed
     */
    public function parse(PageView $pageView): ParserResultInterface
    {
        $result = new ParserResult();
        $result->setGamePeriod(gamePeriod: $this->gamePeriod);

        if ($this->isNotItemUse(pageView: $pageView)) {
            return $result;
        }

        $responseBody = $pageView->getResponseBody();
        if (empty($responseBody)) {
            throw new ParserError(message: 'Stored response body is empty');
        }

        $encoding = $this->detectEncoding(responseBody: $responseBody);

        // iconv() returns false on failure; report that as a parser error instead of letting
        // the boolean reach a string-typed parameter
        $utf8Body = iconv(from_encoding: $encoding, to_encoding: 'UTF-8', string: $responseBody);
        if (false === $utf8Body) {
            throw new ParserError(
                message: sprintf('Could not convert response body from "%s" to UTF-8', $encoding),
            );
        }

        $result->setLeaderboard(
            leaderboard: $this->getLeaderboardFromResponseBody(responseBody: $utf8Body),
        );

        return $result;
    }

    /**
     * Checks the page view URL and reports whether it is anything other than an item use request.
     *
     * @return bool false only when the path is "/modules.php", "name" equals "Game"
     *              and "op" equals "useitem"; true otherwise
     */
    private function isNotItemUse(PageView $pageView): bool
    {
        $url = $pageView->getUrl();

        // check URL path
        if (self::EXPECTED_PATH !== parse_url(url: $url, component: PHP_URL_PATH)) {
            return true;
        }

        // extract query params from URL
        $queryParams = [];
        $queryStr = parse_url(url: $url, component: PHP_URL_QUERY);
        if (is_string(value: $queryStr)) {
            parse_str(string: $queryStr, result: $queryParams);
        }

        // check module name
        if (self::MODULE_ARG_VALUE_GAME !== ($queryParams[self::MODULE_ARG_NAME] ?? null)) {
            return true;
        }

        // check action type
        return self::ACTION_ARG_VALUE_USE_ITEM !== ($queryParams[self::ACTION_ARG_NAME] ?? null);
    }

    /**
     * Reads the charset name declared in a META tag within the first 512 bytes of the page source.
     *
     * @throws ParserError when no charset declaration can be found
     */
    private function detectEncoding(string $responseBody): string
    {
        // byte-wise cut on purpose: the encoding of the body is not known yet
        $htmlSample = substr(string: $responseBody, offset: 0, length: 512);

        // find charset declaration
        $tagMatches = [];
        preg_match(pattern: self::REGEXP_META_CHARSET_TAG, subject: $htmlSample, matches: $tagMatches);
        if (false === array_key_exists(key: 'attribute', array: $tagMatches)) {
            throw new ParserError(message: 'Could not find META CHARSET element in page source');
        }

        // find charset value
        $valueMatches = [];
        preg_match(
            pattern: self::REGEXP_META_CHARSET_ATTRIBUTE,
            subject: $tagMatches['attribute'],
            matches: $valueMatches,
        );
        if (false === array_key_exists(key: 'value', array: $valueMatches)) {
            throw new ParserError(message: 'Could not get CHARSET value from META CHARSET element');
        }

        return $valueMatches['value'];
    }

    /**
     * Extracts the leaderboard from a newspaper use page.
     *
     * @param string $responseBody page source, already converted to UTF-8
     *
     * @return Leaderboard|null null when the page has no "#Messages" or "#Errors" section, the
     *                          messages do not start with a newspaper use entry, or the "#Errors"
     *                          section holds one of the known item use errors
     *
     * @throws ParserError when the page is a newspaper use page but the leaderboard table is
     *                     missing or does not have the expected structure
     */
    private function getLeaderboardFromResponseBody(string $responseBody): ?Leaderboard
    {
        // The body is UTF-8 by now but still contains its original META charset declaration.
        // The charset is passed explicitly so that the crawler does not decode the markup
        // according to that stale declaration.
        $crawler = new Crawler();
        $crawler->addHtmlContent(content: $responseBody, charset: 'UTF-8');

        // search for messages section
        $messagesCrawler = $crawler->filter(selector: '#Messages');
        if (0 === $messagesCrawler->count()) {
            return null;
        }

        // check if messages section starts with newspaper use
        if (1 !== preg_match(pattern: self::REGEXP_MESSAGES_NEWSPAPER_USE, subject: $messagesCrawler->html())) {
            return null;
        }

        // search for errors section (for some reason it is also sometimes reused to show item use results)
        $errorsCrawler = $crawler->filter(selector: '#Errors');
        if (0 === $errorsCrawler->count()) {
            return null;
        }

        // check if errors section contain an actual error message
        if (in_array(needle: $errorsCrawler->text(), haystack: self::KNOWN_ITEM_USE_ERRORS, strict: true)) {
            return null;
        }

        // the leaderboard is the second table inside the use result section (labelled "Errors")
        $leaderboardTable = $crawler->filter(selector: '#Errors table')->eq(position: 1);
        if (0 === $leaderboardTable->count()) {
            throw new ParserError(message: 'Could not find leaderboard table, despite being newspaper use page');
        }

        // parse table headers: first one holds "<name> (<type>)", third one holds the score label
        $headerCells = $leaderboardTable->filter(selector: 'th');
        if (3 > $headerCells->count()) {
            throw new ParserError(message: 'Missing headers in leaderboard table');
        }
        $headerMatches = [];
        preg_match(
            pattern: self::REGEXP_LEADERBOARD_TABLE_HEADER,
            subject: $headerCells->getNode(position: 0)->textContent,
            matches: $headerMatches,
        );
        if (false === isset($headerMatches['name'], $headerMatches['type'])) {
            throw new ParserError(message: 'Could not find name and type in leaderboard header');
        }

        $leaderboard = new Leaderboard();
        $leaderboard->setName(name: $headerMatches['name']);
        $leaderboard->setType(
            type: mb_convert_case(string: $headerMatches['type'], mode: MB_CASE_LOWER, encoding: 'UTF-8'),
        );
        $leaderboard->setScoreLabel(scoreLabel: $headerCells->getNode(position: 2)->textContent);

        // parse entry rows; entries start at the third row, so at least three rows are required
        $tableRows = $leaderboardTable->filter(selector: 'tr');
        $tableRowsCount = $tableRows->count();
        if (3 > $tableRowsCount) {
            throw new ParserError(message: 'Missing entry rows in leaderboard table');
        }

        // NOTE(review): relies on getEntries() returning an object that supports array access,
        // otherwise the assignments below would not reach the leaderboard - confirm
        $leaderboardEntries = $leaderboard->getEntries();
        for ($rowIndex = 2; $rowIndex < $tableRowsCount; $rowIndex++) {
            $rowCells = $tableRows->eq(position: $rowIndex)->filter(selector: 'td');
            if (2 > $rowCells->count()) {
                throw new ParserError(message: 'Not enough cells in entry row');
            }

            // first cell holds "<position>.) <character name>", second cell holds the score
            $entryMatches = [];
            preg_match(
                pattern: self::REGEXP_LEADERBOARD_ENTRY_HEADER,
                subject: $rowCells->getNode(position: 0)->textContent,
                matches: $entryMatches,
            );
            if (false === isset($entryMatches['position'], $entryMatches['characterName'])) {
                throw new ParserError(message: 'Could not find position and character name in leaderboard entry');
            }

            $entry = new Entry();
            $entry->setCharacterName(characterName: $entryMatches['characterName']);
            $entry->setScore(value: intval(value: $rowCells->getNode(position: 1)->textContent));

            $leaderboardEntries[intval(value: $entryMatches['position'])] = $entry;
        }

        return $leaderboard;
    }

    /**
     * Builds a placeholder career leaderboard with ten entries and random, descending score ranges.
     *
     * Not called from anywhere within this class at the moment.
     */
    private function generateDummyLeaderboard(): LeaderboardInterface
    {
        $leaderboard = new Leaderboard();
        $leaderboard->setName(name: 'Dummy leaderboard');
        $leaderboard->setType(type: LeaderboardTypes::CAREER);
        $leaderboard->setScoreLabel(scoreLabel: 'Dummy score');

        $entries = $leaderboard->getEntries();

        for ($position = 1; $position <= 10; $position++) {
            // every position gets its own band of 100 points, higher positions score higher
            $minScore = (10 - $position) * 100;

            $entry = new Entry();
            $entry->setCharacterName(characterName: sprintf('Dummy #%d', $position));
            $entry->setScore(value: mt_rand(min: $minScore, max: $minScore + 99));

            $entries[$position] = $entry;
        }

        return $leaderboard;
    }
}
... ...
@@ -0,0 +1,29 @@
1
+<?php
2
+
3
+declare(strict_types=1);
4
+
5
+namespace App\Service\Repository\Nexus;
6
+
7
+use App\Doctrine\Entity\Nexus\GamePeriod;
8
+use Doctrine\ORM\EntityManagerInterface;
9
+
10
/**
 * Lookup service for game period entities, backed by the Doctrine entity manager.
 */
final class GamePeriodRepository
{
    public function __construct(
        private EntityManagerInterface $entityManager,
    ) {
    }

    /**
     * Fetches the game period whose "id" field equals the given value.
     *
     * @return GamePeriod|null matching entity, or null when the query yields no result
     */
    public function findById(int $id): ?GamePeriod
    {
        return $this->entityManager
            ->createQueryBuilder()
            ->select(select: 'gp')
            ->from(from: GamePeriod::class, alias: 'gp')
            ->where(predicates: 'gp.id = :id')
            ->setParameter(key: 'id', value: $id)
            ->getQuery()
            ->getOneOrNullResult();
    }
}
0 30