Theodosius v3.0
Jit linker, symbol mapper, and obfuscator
decomp.cpp
Go to the documentation of this file.
1// Copyright (c) 2022, _xeroxz
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are met:
6//
7// 1. Redistributions of source code must retain the above copyright notice,
8// this list of conditions and the following disclaimer.
9//
10// 2. Redistributions in binary form must reproduce the above copyright notice,
11// this list of conditions and the following disclaimer in the documentation
12// and/or other materials provided with the distribution.
13//
14// 3. Neither the name of the copyright holder nor the names of its
15// contributors may be used to endorse or promote products derived from
16// this software without specific prior written permission.
17//
18// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28// POSSIBILITY OF SUCH DAMAGE.
29//
30
31#include <decomp/decomp.hpp>
32
33namespace theo::decomp {
34decomp_t::decomp_t(std::vector<std::uint8_t>& lib, recomp::symbol_table_t* syms)
35 : m_lib(lib), m_syms(syms) {}
36
37std::optional<recomp::symbol_table_t*> decomp_t::decompose(
38 std::string& entry_sym) {
39 // extract obj files from the archive file...
40 //
41 ar::view<false> lib(m_lib.data(), m_lib.size());
42 std::for_each(
43 lib.begin(), lib.end(),
44 [&](std::pair<std::string_view, ar::entry_t&> itr) {
45 // if the entry isnt the symbol table or the string table
46 // then we know its an obj file...
47 //
48 if (!itr.second.is_symbol_table() && !itr.second.is_string_table()) {
49 spdlog::info("extracted obj from archive: {}", itr.first);
50 std::vector<std::uint8_t> data(itr.second.begin(), itr.second.end());
51 m_objs.push_back(data);
52 }
53 });
54
55 std::for_each(
56 m_objs.begin(), m_objs.end(), [&](std::vector<std::uint8_t>& img_data) {
57 auto img = reinterpret_cast<coff::image_t*>(img_data.data());
58 for (auto idx = 0u; idx < img->file_header.num_symbols; ++idx) {
59 auto sym = img->get_symbol(idx);
60 if (sym->section_index - 1 > img->file_header.num_sections)
61 continue;
62
63 auto sym_name = symbol_t::name(img, sym);
64 if (sym_name.length()) {
65 auto sym_hash = symbol_t::hash(sym_name.data());
66 auto sym_size =
67 sym->has_section()
68 ? next_sym(img, img->get_section(sym->section_index - 1),
69 sym)
70 : 0u;
71
72 m_lookup_tbl[sym_hash].emplace_back(img, sym, sym_size);
73 }
74 }
75 });
76
77 // extract used symbols from objs and create a nice little set of them so that
78 // we can easily decompose them... no need deal with every single symbol...
79 spdlog::info("extracted {} symbols being used...",
80 ext_used_syms(entry_sym.data()));
81
82 // generate symbols, populate section hash table, for each object file
83 // extracted from the archive file...
84 //
85 std::for_each(m_used_syms.begin(), m_used_syms.end(), [&](sym_data_t data) {
86 auto [img, sym, size] = data;
87
88 // populate section hash table with sections for the img of this
89 // symbol... only populate the hash table if its not been populated for
90 // this obj before...
91 //
92 if (m_processed_objs.emplace(img).second) {
93 for (auto idx = 0u; idx < img->file_header.num_sections; ++idx) {
94 auto scn = img->get_section(idx);
95 auto scn_sym_name =
96 std::string(scn->name.to_string(img->get_strings()))
97 .append("#")
98 .append(std::to_string(idx))
99 .append("!")
100 .append(std::to_string(img->file_header.timedate_stamp));
101
102 // hash the name of the section + the index + the timestamp of the
103 // obj file it is in...
104 //
105 m_scn_hash_tbl.insert({scn, decomp::symbol_t::hash(scn_sym_name)});
106 }
107 }
108
109 // if the symbol is a function then we are going to decompose it...
110 // data symbols are handled after this...
111 //
112 if (sym->has_section()) {
113 if (sym->derived_type == coff::derived_type_id::function) {
114 auto scn = img->get_section(sym->section_index - 1);
115 auto dcmp_type =
116 scn->name.to_string(img->get_strings()) == INSTR_SPLIT_SECTION_NAME
117 ? decomp::sym_type_t::instruction
118 : decomp::sym_type_t::function;
119
120 auto fn_size = next_sym(img, scn, sym);
121 auto fn_bgn = scn->ptr_raw_data + reinterpret_cast<std::uint8_t*>(img) +
122 sym->value;
123
124 std::vector<std::uint8_t> fn(fn_bgn, fn_bgn + fn_size);
125 decomp::routine_t rtn(sym, img, scn, fn, dcmp_type);
126
127 auto syms = rtn.decompose();
128 m_syms->put_symbols(syms);
129 } else if (sym->storage_class == coff::storage_class_id::public_symbol ||
130 sym->storage_class == coff::storage_class_id::private_symbol) {
131 auto scn = img->get_section(sym->section_index - 1);
132 auto scn_sym = m_syms->sym_from_hash(m_scn_hash_tbl[scn]);
133
134 // if the section doesnt have a symbol then make one and put it into
135 // the symbol table...
136 //
137 if (!scn_sym.has_value()) {
138 auto scn_sym_name =
139 std::string(scn->name.to_string(img->get_strings()))
140 .append("#")
141 .append(std::to_string(sym->section_index - 1))
142 .append("!")
143 .append(std::to_string(img->file_header.timedate_stamp));
144
145 std::vector<std::uint8_t> scn_data(scn->size_raw_data);
146 if (scn->characteristics.cnt_uninit_data) {
147 scn_data.insert(scn_data.begin(), scn->size_raw_data, 0);
148 } else {
149 scn_data.insert(
150 scn_data.begin(),
151 reinterpret_cast<std::uint8_t*>(img) + scn->ptr_raw_data,
152 reinterpret_cast<std::uint8_t*>(img) + scn->ptr_raw_data +
153 scn->size_raw_data);
154 }
155
156 std::vector<recomp::reloc_t> relocs;
157 auto scn_relocs = reinterpret_cast<coff::reloc_t*>(
158 scn->ptr_relocs + reinterpret_cast<std::uint8_t*>(img));
159
160 for (auto idx = 0u; idx < scn->num_relocs; ++idx) {
161 auto scn_reloc = &scn_relocs[idx];
162 auto sym_reloc = img->get_symbol(scn_relocs[idx].symbol_index);
163 auto sym_name = symbol_t::name(img, sym_reloc);
164 auto sym_hash = decomp::symbol_t::hash(sym_name.data());
165 relocs.push_back(
166 recomp::reloc_t(scn_reloc->virtual_address - sym->value,
167 sym_hash, sym_name.data()));
168 }
169
170 decomp::symbol_t new_scn_sym(img, scn_sym_name, 0, scn_data, scn, {},
171 relocs, sym_type_t::section);
172
173 m_syms->put_symbol(new_scn_sym);
174 }
175
176 // create a symbol for the data...
177 //
178 decomp::symbol_t new_sym(img, symbol_t::name(img, sym).data(),
179 sym->value, {}, scn, sym, {},
181
182 m_syms->put_symbol(new_sym);
183 }
184 } else if (sym->storage_class ==
185 coff::storage_class_id::
186 external_definition) { // else if the symbol has no
187 // section... these symbols
188 // require the linker to allocate
189 // space for them...
190
191 std::vector<std::uint8_t> data(sym->value, 0);
192 decomp::symbol_t bss_sym(img, symbol_t::name(img, sym).data(), {}, data,
193 {}, sym, {}, sym_type_t::data);
194
195 m_syms->put_symbol(bss_sym);
196 }
197 });
198
199 // return the extract symbols to the caller...
200 //
201 return m_syms;
202}
203
204std::uint32_t decomp_t::next_sym(coff::image_t* img,
205 coff::section_header_t* hdr,
206 coff::symbol_t* s) {
207 // loop over all symbols in this object...
208 // find the next symbol inside of the same section...
209 // if there is no next symbol then we use the end of the section...
210 std::uint32_t res = hdr->size_raw_data;
211 for (auto idx = 0u; idx < img->file_header.num_symbols; ++idx) {
212 auto q = img->get_symbol(idx);
213 if (q->derived_type == coff::derived_type_id::function &&
214 q->section_index == s->section_index && q != s)
215 if (q->value > s->value && q->value < res)
216 res = q->value;
217 }
218 return res;
219}
220
221std::uint32_t decomp_t::ext_used_syms(const std::string&& entry_sym) {
222 std::optional<sym_data_t> entry = get_symbol(entry_sym.data());
223 if (!entry.has_value())
224 return 0u;
225
226 std::set<coff::symbol_t*> cache;
227 const auto finding_syms = [&]() -> bool {
228 for (auto itr = m_used_syms.begin(); itr != m_used_syms.end(); ++itr) {
229 auto [img, sym, size] = *itr;
230 if (sym->has_section() && !cache.count(sym) && size) {
231 auto scn = img->get_section(sym->section_index - 1);
232 auto num_relocs = scn->num_relocs;
233 auto relocs = reinterpret_cast<coff::reloc_t*>(
234 scn->ptr_relocs + reinterpret_cast<std::uint8_t*>(img));
235
236 for (auto idx = 0u; idx < num_relocs; ++idx) {
237 auto reloc = &relocs[idx];
238 // if the reloc is inside of the current symbol...
239 if (reloc->virtual_address >= sym->value &&
240 reloc->virtual_address < sym->value + size) {
241 auto reloc_sym = img->get_symbol(reloc->symbol_index);
242 auto sym_name = symbol_t::name(img, reloc_sym);
243 entry = get_symbol(sym_name);
244 if (m_used_syms.emplace(entry.value()).second)
245 return true;
246 }
247 }
248 cache.emplace(sym);
249 }
250 }
251 return false;
252 };
253
254 m_used_syms.emplace(entry.value());
255 for (m_used_syms.emplace(entry.value()); finding_syms();)
256 ;
257
258 return m_used_syms.size();
259}
260
261std::optional<sym_data_t> decomp_t::get_symbol(const std::string_view& name) {
262 coff::image_t* img = {};
263 coff::symbol_t* sym = {};
264 std::uint32_t size = {};
265
266 auto& syms = m_lookup_tbl[symbol_t::hash(name.data())];
267 for (auto idx = 0u; idx < syms.size(); ++idx) {
268 img = std::get<0>(syms[idx]);
269 sym = std::get<1>(syms[idx]);
270 size = std::get<2>(syms[idx]);
271 if (sym->has_section())
272 return {{img, sym, size}};
273 }
274
275 if (img && sym)
276 return {{img, sym, size}};
277
278 return {};
279}
280
281std::vector<routine_t> decomp_t::rtns() {
282 return m_rtns;
283}
284
285std::vector<std::uint8_t> decomp_t::lib() {
286 return m_lib;
287}
288
289std::vector<std::vector<std::uint8_t>> decomp_t::objs() {
290 return m_objs;
291}
292
293recomp::symbol_table_t* decomp_t::syms() {
294 return m_syms;
295}
296
297std::map<coff::section_header_t*, std::size_t>& decomp_t::scn_hash_tbl() {
298 return m_scn_hash_tbl;
299}
300} // namespace theo::decomp