Theodosius  v3.0
Jit linker, mapper, obfuscator, and mutator
decomp.cpp
Go to the documentation of this file.
1 // Copyright (c) 2022, _xeroxz
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are met:
6 //
7 // 1. Redistributions of source code must retain the above copyright notice,
8 // this list of conditions and the following disclaimer.
9 //
10 // 2. Redistributions in binary form must reproduce the above copyright notice,
11 // this list of conditions and the following disclaimer in the documentation
12 // and/or other materials provided with the distribution.
13 //
14 // 3. Neither the name of the copyright holder nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 // POSSIBILITY OF SUCH DAMAGE.
29 //
30 
31 #include <decomp/decomp.hpp>
32 
33 namespace theo::decomp {
34 decomp_t::decomp_t(std::vector<std::uint8_t>& lib, recomp::symbol_table_t* syms)
35  : m_lib(lib), m_syms(syms) {}
36 
37 std::optional<recomp::symbol_table_t*> decomp_t::decompose(
38  std::string& entry_sym) {
39  // extract obj files from the archive file...
40  //
41  ar::view<false> lib(m_lib.data(), m_lib.size());
42  std::for_each(
43  lib.begin(), lib.end(),
44  [&](std::pair<std::string_view, ar::entry_t&> itr) {
45  // if the entry isnt the symbol table or the string table
46  // then we know its an obj file...
47  //
48  if (!itr.second.is_symbol_table() && !itr.second.is_string_table()) {
49  spdlog::info("extracted obj from archive: {}", itr.first);
50  std::vector<std::uint8_t> data(itr.second.begin(), itr.second.end());
51  m_objs.push_back(data);
52  }
53  });
54 
55  std::for_each(
56  m_objs.begin(), m_objs.end(), [&](std::vector<std::uint8_t>& img_data) {
57  auto img = reinterpret_cast<coff::image_t*>(img_data.data());
58  for (auto idx = 0u; idx < img->file_header.num_symbols; ++idx) {
59  auto sym = img->get_symbol(idx);
60  if (sym->section_index - 1 > img->file_header.num_sections)
61  continue;
62 
63  auto sym_name = symbol_t::name(img, sym);
64  if (sym_name.length()) {
65  auto sym_hash = symbol_t::hash(sym_name.data());
66  auto sym_size =
67  sym->has_section()
68  ? next_sym(img, img->get_section(sym->section_index - 1),
69  sym)
70  : 0u;
71 
72  m_lookup_tbl[sym_hash].emplace_back(img, sym, sym_size);
73  }
74  }
75  });
76 
77  // extract used symbols from objs and create a nice little set of them so that
78  // we can easily decompose them... no need deal with every single symbol...
79  spdlog::info("extracted {} symbols being used...",
80  ext_used_syms(entry_sym.data()));
81 
82  // generate symbols, populate section hash table, for each object file
83  // extracted from the archive file...
84  //
85  std::for_each(m_used_syms.begin(), m_used_syms.end(), [&](sym_data_t data) {
86  auto [img, sym, size] = data;
87 
88  // populate section hash table with sections for the img of this
89  // symbol... only populate the hash table if its not been populated for
90  // this obj before...
91  //
92  if (m_processed_objs.emplace(img).second) {
93  for (auto idx = 0u; idx < img->file_header.num_sections; ++idx) {
94  auto scn = img->get_section(idx);
95  auto scn_sym_name =
96  std::string(scn->name.to_string(img->get_strings()))
97  .append("#")
98  .append(std::to_string(idx))
99  .append("!")
100  .append(std::to_string(img->file_header.timedate_stamp));
101 
102  // hash the name of the section + the index + the timestamp of the
103  // obj file it is in...
104  //
105  m_scn_hash_tbl.insert({scn, decomp::symbol_t::hash(scn_sym_name)});
106  }
107  }
108 
109  // if the symbol is a function then we are going to decompose it...
110  // data symbols are handled after this...
111  //
112  if (sym->has_section()) {
113  if (sym->derived_type == coff::derived_type_id::function) {
114  auto scn = img->get_section(sym->section_index - 1);
115  auto dcmp_type =
116  scn->name.to_string(img->get_strings()) == INSTR_SPLIT_SECTION_NAME
117  ? decomp::sym_type_t::instruction
118  : decomp::sym_type_t::function;
119 
120  auto fn_size = next_sym(img, scn, sym);
121  auto fn_bgn = scn->ptr_raw_data + reinterpret_cast<std::uint8_t*>(img) +
122  sym->value;
123 
124  std::vector<std::uint8_t> fn(fn_bgn, fn_bgn + fn_size);
125  decomp::routine_t rtn(sym, img, scn, fn, dcmp_type);
126 
127  auto syms = rtn.decompose();
128  m_syms->put_symbols(syms);
129  } else if (sym->storage_class == coff::storage_class_id::public_symbol ||
130  sym->storage_class == coff::storage_class_id::private_symbol) {
131  auto scn = img->get_section(sym->section_index - 1);
132  auto scn_sym = m_syms->sym_from_hash(m_scn_hash_tbl[scn]);
133 
134  // if the section doesnt have a symbol then make one and put it into
135  // the symbol table...
136  //
137  if (!scn_sym.has_value()) {
138  auto scn_sym_name =
139  std::string(scn->name.to_string(img->get_strings()))
140  .append("#")
141  .append(std::to_string(sym->section_index - 1))
142  .append("!")
143  .append(std::to_string(img->file_header.timedate_stamp));
144 
145  std::vector<std::uint8_t> scn_data(scn->size_raw_data);
146  if (scn->characteristics.cnt_uninit_data) {
147  scn_data.insert(scn_data.begin(), scn->size_raw_data, 0);
148  } else {
149  scn_data.insert(
150  scn_data.begin(),
151  reinterpret_cast<std::uint8_t*>(img) + scn->ptr_raw_data,
152  reinterpret_cast<std::uint8_t*>(img) + scn->ptr_raw_data +
153  scn->size_raw_data);
154  }
155 
156  std::vector<recomp::reloc_t> relocs;
157  auto scn_relocs = reinterpret_cast<coff::reloc_t*>(
158  scn->ptr_relocs + reinterpret_cast<std::uint8_t*>(img));
159 
160  for (auto idx = 0u; idx < scn->num_relocs; ++idx) {
161  auto scn_reloc = &scn_relocs[idx];
162  auto sym_reloc = img->get_symbol(scn_relocs[idx].symbol_index);
163  auto sym_name = symbol_t::name(img, sym_reloc);
164  auto sym_hash = decomp::symbol_t::hash(sym_name.data());
165  relocs.push_back(
166  recomp::reloc_t(scn_reloc->virtual_address - sym->value,
167  sym_hash, sym_name.data()));
168  }
169 
170  decomp::symbol_t new_scn_sym(img, scn_sym_name, 0, scn_data, scn, {},
171  relocs, sym_type_t::section);
172 
173  m_syms->put_symbol(new_scn_sym);
174  }
175 
176  // create a symbol for the data...
177  //
178  decomp::symbol_t new_sym(img, symbol_t::name(img, sym).data(),
179  sym->value, {}, scn, sym, {},
181 
182  m_syms->put_symbol(new_sym);
183  }
184  } else if (sym->storage_class ==
185  coff::storage_class_id::
186  external_definition) { // else if the symbol has no
187  // section... these symbols
188  // require the linker to allocate
189  // space for them...
190 
191  std::vector<std::uint8_t> data(sym->value, 0);
192  decomp::symbol_t bss_sym(img, symbol_t::name(img, sym).data(), {}, data,
193  {}, sym, {}, sym_type_t::data);
194 
195  m_syms->put_symbol(bss_sym);
196  }
197  });
198 
199  // return the extract symbols to the caller...
200  //
201  return m_syms;
202 }
203 
204 std::uint32_t decomp_t::next_sym(coff::image_t* img,
205  coff::section_header_t* hdr,
206  coff::symbol_t* s) {
207  // loop over all symbols in this object...
208  // find the next symbol inside of the same section...
209  // if there is no next symbol then we use the end of the section...
210  std::uint32_t res = hdr->size_raw_data;
211  for (auto idx = 0u; idx < img->file_header.num_symbols; ++idx) {
212  auto q = img->get_symbol(idx);
213  if (q->derived_type == coff::derived_type_id::function &&
214  q->section_index == s->section_index && q != s)
215  if (q->value > s->value && q->value < res)
216  res = q->value;
217  }
218  return res;
219 }
220 
221 std::uint32_t decomp_t::ext_used_syms(const std::string&& entry_sym) {
222  std::optional<sym_data_t> entry = get_symbol(entry_sym.data());
223  if (!entry.has_value())
224  return 0u;
225 
226  std::set<coff::symbol_t*> cache;
227  const auto finding_syms = [&]() -> bool {
228  for (auto itr = m_used_syms.begin(); itr != m_used_syms.end(); ++itr) {
229  auto [img, sym, size] = *itr;
230  if (sym->has_section() && !cache.count(sym) && size) {
231  auto scn = img->get_section(sym->section_index - 1);
232  auto num_relocs = scn->num_relocs;
233  auto relocs = reinterpret_cast<coff::reloc_t*>(
234  scn->ptr_relocs + reinterpret_cast<std::uint8_t*>(img));
235 
236  for (auto idx = 0u; idx < num_relocs; ++idx) {
237  auto reloc = &relocs[idx];
238  // if the reloc is inside of the current symbol...
239  if (reloc->virtual_address >= sym->value &&
240  reloc->virtual_address < sym->value + size) {
241  auto reloc_sym = img->get_symbol(reloc->symbol_index);
242  auto sym_name = symbol_t::name(img, reloc_sym);
243  entry = get_symbol(sym_name);
244  if (m_used_syms.emplace(entry.value()).second)
245  return true;
246  }
247  }
248  cache.emplace(sym);
249  }
250  }
251  return false;
252  };
253 
254  m_used_syms.emplace(entry.value());
255  for (m_used_syms.emplace(entry.value()); finding_syms();)
256  ;
257 
258  return m_used_syms.size();
259 }
260 
261 std::optional<sym_data_t> decomp_t::get_symbol(const std::string_view& name) {
262  coff::image_t* img = {};
263  coff::symbol_t* sym = {};
264  std::uint32_t size = {};
265 
266  auto& syms = m_lookup_tbl[symbol_t::hash(name.data())];
267  for (auto idx = 0u; idx < syms.size(); ++idx) {
268  img = std::get<0>(syms[idx]);
269  sym = std::get<1>(syms[idx]);
270  size = std::get<2>(syms[idx]);
271  if (sym->has_section())
272  return {{img, sym, size}};
273  }
274 
275  if (img && sym)
276  return {{img, sym, size}};
277 
278  return {};
279 }
280 
281 std::vector<routine_t> decomp_t::rtns() {
282  return m_rtns;
283 }
284 
285 std::vector<std::uint8_t> decomp_t::lib() {
286  return m_lib;
287 }
288 
289 std::vector<std::vector<std::uint8_t>> decomp_t::objs() {
290  return m_objs;
291 }
292 
293 recomp::symbol_table_t* decomp_t::syms() {
294  return m_syms;
295 }
296 
297 std::map<coff::section_header_t*, std::size_t>& decomp_t::scn_hash_tbl() {
298  return m_scn_hash_tbl;
299 }
300 } // namespace theo::decomp
decomp_t(std::vector< std::uint8_t > &lib, recomp::symbol_table_t *syms)
the explicit constructor for decomp_t
Definition: decomp.cpp:34
std::vector< std::uint8_t > lib()
gets a vector of bytes consisting of the lib file.
Definition: decomp.cpp:285
std::optional< recomp::symbol_table_t * > decompose(std::string &entry_sym)
decomposes (extracts) the symbols used. this function determines all used symbols given the entry poi...
Definition: decomp.cpp:37
symbol_t is an abstraction upon the coff symbol. this allows for easier manipulation of the symbol....
Definition: symbol.hpp:53
this class is a high level wrapper for a hashmap that contains decomp::symbol_t values....
the namespace that contains all of the decomposition related code.
Definition: decomp.hpp:49
std::tuple< coff::image_t *, coff::symbol_t *, std::uint32_t > sym_data_t
meta symbol data. consists of the coff image which contains the coff symbol, the coff symbol itself,...
Definition: decomp.hpp:55